145 lines
4 KiB
Python
145 lines
4 KiB
Python
#!/usr/bin/python
|
|
# coding=UTF-8
|
|
|
|
import subprocess
|
|
import re
|
|
import pdb
|
|
|
|
""" pdftotext -layout -nopgbrk p1220.pdf - """
|
|
|
|
|
|
class PDFRecordFinder(object):
    """Find record definitions in the text layout of a PDF file.

    The PDF is converted with ``pdftotext -layout -nopgbrk`` and the output
    is kept as a list of byte-string rows.  Rows matching the heading
    pattern mark where a record starts; the rows between two headings are
    split into whitespace-separated columns and grouped into fields with
    ``ColumnCollector``.
    """

    # Two or more whitespace characters in a row separate table columns.
    _MULTIWHITE = re.compile(r'\s{2,}')

    def __init__(self, src, heading_exp=None):
        """Run ``pdftotext`` on *src* and store its output rows.

        Args:
            src: Path of the PDF file passed to ``pdftotext``.
            heading_exp: Compiled pattern applied with ``match`` to every
                raw (byte string) row; its groups joined together form the
                record name.  When omitted, rows of the form
                ``<whitespace>Record Name: <name>`` are used.

        Raises:
            subprocess.CalledProcessError: ``pdftotext`` exited with a
                non-zero status.
        """
        if not heading_exp:
            # Byte pattern: the rows it is matched against are the raw
            # bytes returned by check_output.
            heading_exp = re.compile(br'\s+Record Name: (.*)')

        opts = ["pdftotext", "-layout", "-nopgbrk", src, '-']
        pdftext = subprocess.check_output(opts)
        self.textrows = pdftext.split(b'\n')
        self.heading_exp = heading_exp

    def records(self):
        """Yield ``(name, fields)`` for every record heading found.

        ``name`` is the heading text decoded as ASCII with undecodable
        bytes dropped; ``fields`` is the list of collectors produced by
        ``find_fields`` for the rows that follow the heading.
        """
        for (start, end, name) in self.locate_heading_rows():
            name = name.decode('ascii', 'ignore')
            section = self.textrows[start + 1:end]
            yield (name, list(self.find_fields(iter(section))))

    def locate_heading_rows(self):
        """Return ``(start, end, name)`` for each record heading.

        ``start`` is the index of the heading row and ``name`` the joined
        groups of the heading match.  Headings whose name ends with
        ``(continued)`` (case-insensitive) are skipped, so their rows stay
        part of the preceding record.  ``end`` is the index of the next
        heading (or the row count for the last record) minus one.
        """
        results = []
        for (i, row) in enumerate(self.textrows):
            match = self.heading_exp.match(row)
            if not match:
                continue
            name = b''.join(match.groups())
            if not name.lower().endswith(b'(continued)'):
                results.append((i, name))

        # Pair every heading with its successor; a sentinel placed after
        # the final row closes the last record.
        followers = results[1:] + [(len(self.textrows), None)]

        # NOTE(review): `end` is one less than the next heading's index and
        # records() uses it as an exclusive slice bound, so the row directly
        # above each heading is never parsed -- confirm this is intended.
        return [
            (current[0], following[0] - 1, current[1])
            for (current, following) in zip(results, followers)
        ]

    def find_fields(self, row_iter):
        """Group the rows from *row_iter* into fields.

        Each row is decoded as UTF-8 and split into columns; rows without
        columns are skipped.  A row that the current collector reports as
        the start of the next field causes the collector to be yielded and
        a new one to be started.  The last collector is yielded when the
        rows are exhausted.

        If a row has a value in a column the current collector does not
        know (``UnknownColumn``), the scan stops immediately and that
        collector is not yielded.
        """
        cc = ColumnCollector()
        for r in row_iter:
            row = self.extract_columns_from_row(r.decode('UTF-8'))

            if not row:
                continue

            if cc.is_next_field(row):
                yield cc
                cc = ColumnCollector()

            try:
                cc.add(row)
            except UnknownColumn:
                # A plain return ends the generator.  Raising StopIteration
                # here would surface as RuntimeError on Python 3.7+.
                return
        yield cc

    def extract_columns_from_row(self, row):
        """Split a text row into ``(offset, value)`` columns.

        Columns are the stretches of *row* between runs of two or more
        whitespace characters.  ``offset`` is the index in *row* where the
        column starts and ``value`` is the column text encoded as ASCII
        with non-ASCII characters dropped.

        Returns:
            A list of ``(offset, value)`` tuples in left-to-right order, or
            ``None`` when the row contains no run of two or more whitespace
            characters (it is then assumed not to be a table row).
        """
        gaps = list(self._MULTIWHITE.finditer(row))
        if not gaps:
            return None

        row_result = []
        start = 0
        for gap in gaps:
            # Zero-width stretches (leading whitespace) are not columns.
            if gap.start() != start:
                value = row[start:gap.start()].encode('ascii', 'ignore')
                row_result.append((start, value))
            start = gap.end()

        # Text after the last gap, if any, is the final column.
        if start != len(row):
            row_result.append((start, row[start:].encode('ascii', 'ignore')))

        return row_result
|
|
|
|
|
|
class UnknownColumn(Exception):
    """Raised by ColumnCollector.merge_column for a column id it has no entry for."""
|
|
|
|
class IsNextField(Exception):
    """Raised by ColumnCollector.add when the row begins the next field."""
|
|
|
|
class ColumnCollector(object):
    """Collect the rows that together make up one table field.

    ``data`` maps a column id to the text gathered for that column.  The
    first row added defines which columns exist; later rows are appended,
    space-separated, to the columns they share with it.  Column ids are
    assumed to be mutually comparable (PDFRecordFinder passes the
    character offset at which each column starts).
    """

    def __init__(self, initial=None):
        """Create an empty collector.

        Args:
            initial: Accepted for compatibility; currently not used.
        """
        self.data = None

    def add(self, data):
        """Add one row of ``(col_id, value)`` pairs.

        The first row becomes the collector's data as-is.  Every later row
        is merged column by column.

        Raises:
            IsNextField: The collector already holds data and *data* starts
                in the same first column, i.e. it begins another field.
            UnknownColumn: *data* has a column the collector does not hold.
                Columns merged before the unknown one stay merged.
        """
        if not self.data:
            self.data = dict(data)
            return

        if self.is_next_field(data):
            raise IsNextField()

        for col_id, value in data:
            self.merge_column(col_id, value)

    def merge_column(self, col_id, value):
        """Append ``value`` (stripped) to column *col_id* after a space.

        Raises:
            UnknownColumn: The collector holds no column *col_id*.
        """
        if not self.data or col_id not in self.data:
            # try adding a wiggle room value?
            raise UnknownColumn(col_id)

        # Pick the separator that matches the value's type so that both
        # text and byte-string columns can be merged.
        separator = b' ' if isinstance(value, bytes) else u' '
        self.data[col_id] += separator + value.strip()

    def is_next_field(self, data):
        """Tell whether the row *data* starts a new field.

        If the collector already holds data and the row's first (lowest)
        column id equals the collector's first column id, the row is
        probably the beginning of the next field rather than a
        continuation of this one.

        Returns:
            ``True`` in that case; ``False`` otherwise, including when the
            collector is still empty or *data* has no columns.
        """
        if not self.data:
            return False

        incoming = dict(data)
        if not incoming:
            return False

        # min() selects the leftmost column deterministically; indexing
        # dict.keys() depends on dict ordering and is not possible on
        # Python 3.
        return min(self.data) == min(incoming)

    @property
    def tuple(self):
        """Column values ordered by column id; ``()`` when nothing was added."""
        data = self.data or {}
        return tuple(data[key] for key in sorted(data))
|
|
|
|
|