Source (Text):
from glob import glob
from sys import maxint
from re import compile, IGNORECASE
from os import remove
from os.path import exists
from time import sleep
from win32process import CreateProcess, STARTUPINFO, GetExitCodeProcess
# Build a sorted, de-duplicated word list for every PDF in dirName.
#
# Each PDF is run through the external "pdftotext" program (started with
# dirName as its working directory, writing temp.txt there).  The words
# found in that text are written to a .txt file that sits next to the PDF
# and shares its name.

# wordpat matches runs of two or more lowercase alphanumerics that may
# contain one interior hyphen; hyphenated then splits such a match into
# its hyphen-free pieces.
wordpat = compile ( r"[a-z0-9]+\-?[a-z0-9]+" )
hyphenated = compile ( r'[^-]+' )

dirName = 'N:/NovaCentury/sample_pdfs/'
tempFileName = '%stemp.txt' % dirName

# Value GetExitCodeProcess reports while the child has not finished yet
# (the Win32 STILL_ACTIVE constant).
STILL_ACTIVE = 259
POLL_ATTEMPTS = 100      # 100 polls x 0.1 s = roughly ten seconds per PDF
POLL_INTERVAL = 0.1
WORDS_PER_LINE = 20

for fileName in glob('%s*.pdf' % dirName):
    # Never read text left behind by a previous PDF.
    if exists(tempFileName):
        remove(tempFileName)
    words = {}

    theProcess, theThread, pId, tId = CreateProcess(
        None, 'pdftotext "%s" temp.txt' % fileName, None, None, 0, 0,
        None, dirName, STARTUPINFO())

    # Poll until the child reports a real exit code.  Any finished process
    # ends the wait at once; previously only codes 0 and 1 did, so every
    # other failure code was polled until the timeout ran out.
    processOK = 0
    for attempt in range(POLL_ATTEMPTS):
        exitCode = GetExitCodeProcess(theProcess)
        if exitCode != STILL_ACTIVE:
            processOK = (exitCode == 0)
            break
        sleep(POLL_INTERVAL)

    if not processOK:
        # NOTE: a child that is still running after the timeout is not
        # terminated here; it is only reported (with exit code 259).
        message = "*** %s (%s)" % (fileName, exitCode)
        if exitCode == 1:
            message = message + " " + ": possibly unreadable"
        print(message)
        continue
    print(fileName)

    # Collect the distinct words.  The file is closed explicitly so the
    # removal below does not depend on garbage collection timing.
    tempFile = open(tempFileName)
    try:
        for line in tempFile:
            for word in wordpat.findall(line.lower()):
                for piece in hyphenated.findall(word):
                    words[piece] = 1
    finally:
        tempFile.close()
    if exists(tempFileName):
        remove(tempFileName)

    keys = list(words.keys())
    keys.sort()

    # Write the words space-separated, WORDS_PER_LINE to a line.  Only
    # complete lines are newline-terminated, so a short final line has no
    # trailing newline.
    textFile = open(fileName[:-3] + 'txt', 'w')
    try:
        for start in range(0, len(keys), WORDS_PER_LINE):
            group = keys[start:start + WORDS_PER_LINE]
            textFile.write(' '.join(group))
            if len(group) == WORDS_PER_LINE:
                textFile.write('\n')
    finally:
        textFile.close()
from glob import glob
import sys, os
sys . path . insert ( 0, "I:/Program Files/WebSite/lib/python" )
import ZPublisher . Client
def uploadFilesSet(url, sourceFiles, contentType, username='Bill', password='seamus'):
    """Add every file matching a glob pattern to the object at *url*.

    url         -- address handed to ZPublisher.Client.Object
    sourceFiles -- glob pattern selecting the local files to send
    contentType -- value passed as content_type for every file
    username, password -- credentials for the connection.  The defaults
        are the values that used to be hard-coded in the body, so
        existing three-argument calls behave as before.
        NOTE(review): prefer passing credentials in over relying on
        literals kept in source.

    Each file is stored under its base name with '&' replaced by ' and '
    and its last three characters lower-cased.  A ServerError whose text
    does not start with '3' is re-raised; one that does is ignored and
    the loop moves on.  Any other failure prints the name and continues
    with the next file.
    """
    folder = ZPublisher.Client.Object(url, username=username, password=password)
    for fileName in glob(sourceFiles):
        name = os.path.split(fileName)[1].replace('&', ' and ')
        name = "%s%s" % (name[:-3], name[-3:].lower())
        try:
            upload = open(fileName, 'rb')
            try:
                folder.manage_addFile(id=name, file=upload, content_type=contentType)
            finally:
                # Release the handle whether or not the upload succeeded.
                upload.close()
        except (KeyboardInterrupt, SystemExit):
            # Let the user (or sys.exit) stop a long upload run.
            raise
        except ZPublisher.Client.ServerError:
            # sys.exc_info() keeps this handler free of version-specific
            # "except X, v" / "except X as v" syntax.
            v = sys.exc_info()[1]
            if str(v)[:1] != '3':
                raise
            # NOTE(review): the source had a
            #     print 'addFile failed for %s' % name
            # line directly after the raise.  Its indentation was lost, so
            # it is unclear whether it was meant to run for '3xx' errors.
            # It is treated here as unreachable and left out -- confirm.
        except:
            # Best effort: report the file that could not be added.
            print(name)
# Upload the PDFs first, then the word-list text files that sit next to
# them; each set goes to its own folder with its own content type.
for targetURL, filePattern, mimeType in (
    ('http://localhost:8080/pdfIndexing/pdfCatalog/pdfDocs', 'sample_pdfs/*.pdf', 'application/pdf'),
    ('http://localhost:8080/pdfIndexing/pdfCatalog/pdfTexts', 'sample_pdfs/*.txt', 'text/plain'),
):
    uploadFilesSet(targetURL, filePattern, mimeType)
<!-- ZPT snippet: emits one div per item in "batch".  For each item it
     reads result/id into theId, builds theURL as 'pdfDocs/' plus theId
     with its last three characters replaced by 'pdf', and renders a link
     to theURL whose text is theId minus its last four characters,
     title-cased.  The placeholder href and the "look here" text are
     both replaced when the template is rendered. -->
<div tal:repeat="result batch" >
<span tal:define="global theId result/id; global theURL python:'pdfDocs/'+theId[:-3]+'pdf'"></span>
<a href="replaceable" tal:attributes="href theURL"><span tal:replace="python:theId[:-4].title()">look here</span></a>
</div>
|