# Python for extracting words from glob of PDFs into correspondingly # named text files from glob import glob from sys import maxint from re import compile, IGNORECASE from os import remove from os.path import exists from time import sleep from win32process import CreateProcess, STARTUPINFO, GetExitCodeProcess wordpat = compile ( r"[a-z0-9]+\-?[a-z0-9]+" ) hyphenated = compile ( r'[^-]+' ) dirName = 'N:/NovaCentury/sample_pdfs/' tempFileName = '%stemp.txt' % dirName for i, fileName in zip ( xrange ( maxint ), glob ( '%s*.pdf' % dirName ) ) : if exists ( tempFileName ) : remove ( tempFileName ) words = { } theProcess, theThread, pId, tId = CreateProcess ( None, 'pdftotext "%s" temp.txt' % fileName, None, None, 0, 0, None, dirName, STARTUPINFO ( ) ) processOK = 0 for i in range ( 100 ) : exitCode = GetExitCodeProcess ( theProcess ) if exitCode == 0 : processOK = 1 break elif exitCode == 1 : break sleep ( 0.1 ) if processOK : print fileName else : print "*** %s (%s)" % ( fileName, exitCode, ), if exitCode == 1 : print ": possibly unreadable" else : print continue for line in open ( tempFileName ) . xreadlines ( ) : for word in wordpat . findall ( line . lower ( ) ) : for piece in hyphenated . findall ( word ) : words [ piece ] = 1 if exists ( tempFileName ) : remove ( tempFileName ) keys = words . keys ( ) keys . sort ( ) textFile = file ( fileName [ : -3 ] + 'txt', 'w' ) for i, key in zip ( xrange ( maxint ), keys ) : print >> textFile, key, if ( i + 1 ) % 20 == 0 : print >> textFile textFile . close ( ) # Python for uploading PDFs and text files of extracted words # to Zope from glob import glob import sys, os sys . path . insert ( 0, "I:/Program Files/WebSite/lib/python" ) import ZPublisher . Client def uploadFilesSet ( url, sourceFiles, contentType ) : object = ZPublisher . Client . Object ( url, username = 'Bill', password = 'seamus' ) for fileName in glob ( sourceFiles ) : name = os . path . split ( fileName ) [ 1 ] . replace ( '&', ' and ' ) name = "%s%s" % ( name [ : -3 ], name [ -3 : ] . lower ( ), ) try: object . manage_addFile ( id = name, file = open ( fileName, 'rb' ), content_type = contentType ) except ZPublisher . Client . ServerError, v: if str ( v ) [ : 1 ] != '3' : raise sys . exc_info ( ) [ 0 ], sys . exc_info ( ) [ 1 ], sys . exc_info ( ) [ 2 ] print 'addFile failed for %s' % name except : print name uploadFilesSet ( 'http://localhost:8080/pdfIndexing/pdfCatalog/pdfDocs', 'sample_pdfs/*.pdf', 'application/pdf' ) uploadFilesSet ( 'http://localhost:8080/pdfIndexing/pdfCatalog/pdfTexts', 'sample_pdfs/*.txt', 'text/plain' )