69 lines
2 KiB
Python
69 lines
2 KiB
Python
#!/usr/bin/python
|
|
# coding=UTF-8
|
|
|
|
import subprocess
|
|
import re
|
|
import pdb
|
|
|
|
""" Text is extracted by running: pdftotext -layout -nopgbrk p1220.pdf - (the trailing '-' sends the text to stdout) """
|
|
|
|
class PDFRecordFinder(object):
    """Find record tables in the text produced by ``pdftotext -layout``.

    The PDF is converted to text once, when the finder is constructed.
    Every row matching ``heading_exp`` starts a new record; the rows that
    follow it, up to the next heading, are split into positioned columns
    by :meth:`extract_record`.
    """

    # Default heading: an indented "Record Name: <title>" line.
    DEFAULT_HEADING_EXP = re.compile(r'\s+Record Name: (.*)')

    # A run of two or more whitespace characters separates table columns.
    _MULTI_WHITE = re.compile(r'\s{2,}')

    def __init__(self, src, heading_exp=None):
        """Run ``pdftotext`` on *src* and keep its output as text rows.

        src         -- path of the PDF file handed to ``pdftotext``.
        heading_exp -- compiled regular expression whose ``match`` marks
                       the first row of a record; defaults to
                       ``DEFAULT_HEADING_EXP``.

        Raises ``subprocess.CalledProcessError`` when ``pdftotext`` exits
        with a non-zero status, and ``OSError`` when it cannot be started.
        """
        if not heading_exp:
            heading_exp = self.DEFAULT_HEADING_EXP

        # Argument list (no shell), so *src* is never interpreted by a shell.
        opts = ["pdftotext", "-layout", "-nopgbrk", src, '-']
        pdftext = subprocess.check_output(opts)

        # check_output returns bytes; decode once here so that every later
        # step works on text on both Python 2 and Python 3.
        self.textrows = self._as_text(pdftext).split('\n')
        self.heading_exp = heading_exp

    @staticmethod
    def _as_text(value):
        """Return *value* as text, decoding byte strings as UTF-8."""
        if isinstance(value, bytes):
            return value.decode('UTF-8')
        return value

    @property
    def rows(self):
        """Iterator over the stored text rows, in document order."""
        return iter(self.textrows)

    @property
    def records(self):
        """Yield one parsed record per heading row.

        Rows before the first heading are ignored and the heading rows
        themselves are not part of any record.  Each yielded value is the
        list returned by :meth:`extract_record` for the rows between one
        heading and the next (or the end of the text).
        """
        # Rows are grouped per heading *before* parsing: handing the shared
        # row iterator to extract_record would let the first record consume
        # every remaining row, later headings included.
        section = None
        for raw in self.rows:
            if self.heading_exp.match(self._as_text(raw)):
                if section is not None:
                    yield self.extract_record(iter(section))
                section = []
            elif section is not None:
                section.append(raw)
        if section is not None:
            yield self.extract_record(iter(section))

    def extract_record(self, row_iter):
        """Split the table rows supplied by *row_iter* into columns.

        row_iter -- iterable of rows (text, or UTF-8 encoded byte strings).

        Rows without a run of two or more whitespace characters are treated
        as ordinary full-width text and skipped, as are rows that hold no
        text at all.

        Returns a list with one entry per table row.  Each entry is a list
        of cells ``(offset, text)``, where ``offset`` is the character
        position of the cell within the row.  A cell that runs to the end
        of the row carries a third element, ``'b'``.
        """
        result = []
        for raw in row_iter:
            row = self._as_text(raw)

            # No multiple-whitespace gap: probably prose, not a table row.
            if not self._MULTI_WHITE.search(row):
                continue

            cells = self._split_columns(row)
            if cells:
                result.append(cells)
        return result

    def _split_columns(self, row):
        """Return the cells of *row*, split on runs of 2+ whitespace."""
        cells = []
        pos = 0
        while pos < len(row):
            gap = self._MULTI_WHITE.search(row, pos)
            if gap is None:
                # Remaining text reaches the end of the row.
                cells.append((pos, row[pos:], 'b'))
                break
            # A gap starting exactly at pos is leading whitespace (or
            # follows another gap): skip it without emitting a cell.
            if gap.start() > pos:
                cells.append((pos, row[pos:gap.start()]))
            pos = gap.end()
        return cells
|