You could try using `subprocess` to call `pdftotext` (probably with the `-layout` option) from the poppler utilities. It has worked much better for me than using pypdf.
For example, I've used the following code to extract CAS numbers from a PDF file:
import subprocess
import re
def findCAS(pdf, page=None):
    '''Find all CAS numbers on the numbered page of a file.

    Runs ``pdftotext -layout -q`` on the file with the text sent to
    stdout, then scans that text for strings shaped like CAS numbers and
    keeps the ones whose check digit is valid according to checkCAS.

    Arguments:
    pdf -- Name of the PDF file to search
    page -- number of the page to search. if None, search all pages.

    Returns a sorted list of the unique valid CAS numbers found, with
    leading zeros stripped.

    Raises subprocess.CalledProcessError if pdftotext exits with a
    non-zero status, and OSError if pdftotext cannot be started.
    '''
    args = ['pdftotext']
    if page is not None:
        # Restrict extraction to a single page: first page == last page.
        args += ['-f', str(page), '-l', str(page)]
    args += ['-layout', '-q', pdf, '-']
    raw = subprocess.check_output(args)
    # check_output returns bytes on Python 3, which cannot be searched with
    # a str pattern. The digits and hyphens we look for are plain ASCII, so
    # replacing any undecodable bytes cannot hide a match.
    txt = raw.decode('utf-8', 'replace')
    # A CAS number is 2-7 digits, a hyphen, 2 digits, a hyphen and a single
    # check digit. Raw string so the \d escapes reach the regex engine.
    candidates = re.findall(r'\d{2,7}-\d{2}-\d', txt)
    # A set removes duplicates; zero-padded variants collapse to one entry
    # because the padding is stripped before insertion.
    checked = {x.lstrip('0') for x in candidates if checkCAS(x)}
    # Sort so the result order is deterministic between runs.
    return sorted(checked)
def checkCAS(cas):
    '''Tell whether a string carries a valid CAS number check digit.

    Hyphens are ignored. The last digit is the check digit; every other
    digit is weighted by its distance from the check digit (1 for the
    nearest, 2 for the next, and so on) and the weighted sum modulo 10
    must equal the check digit.

    Arguments:
    cas -- string to check

    Returns True if the check digit matches, False otherwise.
    '''
    digits = cas.replace('-', '')
    expected = int(digits[-1])  # trailing digit is the check digit
    # Checksum method from: http://nl.wikipedia.org/wiki/CAS-nummer
    # Walk the remaining digits right-to-left so the weights grow with
    # distance from the check digit.
    weighted = sum(
        weight * int(digit)
        for weight, digit in enumerate(reversed(digits[:-1]), start=1)
    )
    return weighted % 10 == expected