pyaccuwage/pyaccuwage/pdfextract.py

145 lines
4 KiB
Python

#!/usr/bin/python
# coding=UTF-8
import subprocess
import re
import pdb
""" pdftotext -layout -nopgbrk p1220.pdf - """
class PDFRecordFinder(object):
    """Find record tables in the text layout of a PDF and split them into fields.

    The constructor runs ``pdftotext -layout -nopgbrk <src> -`` and keeps the
    output as a list of raw rows in ``self.textrows``.  A record starts at a
    row matched by ``heading_exp``; the rows that follow are parsed into
    ``ColumnCollector`` objects, one per table field.

    Byte-string literals are used wherever the raw ``pdftotext`` output is
    handled.  On Python 2 they are ordinary ``str`` objects, so results are
    unchanged; on Python 3 they match the ``bytes`` that
    ``subprocess.check_output`` returns.
    """

    # Default heading pattern; the captured group is the record name.
    _DEFAULT_HEADING = re.compile(br'\s+Record Name: (.*)')

    # Two or more consecutive whitespace characters separate table columns.
    # Compiled once here rather than for every row.
    _COLUMN_GAP = re.compile(r'\s{2,}')

    def __init__(self, src, heading_exp=None):
        """Extract the text of ``src`` with ``pdftotext`` and split it into rows.

        :param src: path of the PDF, passed straight to ``pdftotext``.
        :param heading_exp: compiled regular expression applied with
            ``match()`` to every raw row; its groups, joined together, form
            the record name.  Defaults to a pattern for ``Record Name: ...``.
            It is applied to the undecoded rows, so on Python 3 it must be a
            bytes pattern.
        :raises subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
        """
        if not heading_exp:
            heading_exp = self._DEFAULT_HEADING

        opts = ["pdftotext", "-layout", "-nopgbrk", src, '-']
        pdftext = subprocess.check_output(opts)
        self.textrows = pdftext.split(b'\n')
        self.heading_exp = heading_exp

    def records(self):
        """Yield ``(name, fields)`` for every record heading found.

        ``name`` is the heading text decoded as ASCII with undecodable bytes
        dropped; ``fields`` is the list produced by :meth:`find_fields` for
        the rows ``textrows[start + 1:end]`` of that record.
        """
        for start, end, name in self.locate_heading_rows():
            name = name.decode('ascii', 'ignore')
            body = iter(self.textrows[start + 1:end])
            yield (name, list(self.find_fields(body)))

    def locate_heading_rows(self):
        """Return ``[(start, end, name), ...]`` for each record heading.

        ``start`` is the index of the heading row.  ``end`` is one less than
        the index of the next heading, or one less than ``len(self.textrows)``
        for the last record.  Headings whose name ends with ``(continued)``
        (case-insensitive) are skipped, so their rows stay part of the
        preceding record.
        """
        headings = []
        for index, row in enumerate(self.textrows):
            match = self.heading_exp.match(row)
            if not match:
                continue
            title = b''.join(match.groups())
            if not title.lower().endswith(b'(continued)'):
                headings.append((index, title))

        # Pair every heading with the one after it; a sentinel positioned at
        # the end of the text closes the final record.
        following = headings[1:] + [(len(self.textrows), None)]
        return [
            (current[0], upcoming[0] - 1, current[1])
            for current, upcoming in zip(headings, following)
        ]

    def find_fields(self, row_iter):
        """Group the rows from ``row_iter`` into ``ColumnCollector`` objects.

        Each row is decoded as UTF-8 and split into columns; rows without
        columns are ignored.  A row for which the current collector reports
        ``is_next_field`` starts a new collector, and the finished one is
        yielded.  The last collector is yielded when the rows run out, even
        if nothing was added to it.

        If a row has a column the current collector does not know
        (``UnknownColumn``), iteration stops at once and the collector that
        was being filled is not yielded.
        """
        collector = ColumnCollector()
        for raw_row in row_iter:
            row = self.extract_columns_from_row(raw_row.decode('UTF-8'))
            if not row:
                continue

            if collector.is_next_field(row):
                yield collector
                collector = ColumnCollector()

            try:
                collector.add(row)
            except UnknownColumn:
                # End the generator with a plain return: raising
                # StopIteration inside a generator is turned into a
                # RuntimeError on Python 3.7+ (PEP 479).
                return

        yield collector

    def extract_columns_from_row(self, row):
        """Split one text row into ``(start_offset, text)`` column tuples.

        Columns are the stretches of ``row`` between runs of two or more
        whitespace characters.  ``start_offset`` is the character position of
        the column in ``row``; ``text`` is the column reduced to ASCII
        (other characters are dropped) as a native ``str``.

        :param row: decoded (unicode) text of a single row.
        :returns: list of column tuples ordered left to right, or ``None``
            when the row has no multi-whitespace gap at all and is therefore
            unlikely to be a table row.
        """
        gaps = list(self._COLUMN_GAP.finditer(row))
        if not gaps:
            return None

        # Alternating column-start / column-end offsets: the row start, then
        # both edges of every gap, then the row end if text follows the last
        # gap.
        bounds = [0]
        for gap in gaps:
            bounds.append(gap.start())
            bounds.append(gap.end())
        if bounds[-1] < len(row):
            bounds.append(len(row))

        columns = []
        # zip() discards the unpaired offset left over when the row ends
        # with a gap.
        for start, end in zip(bounds[0::2], bounds[1::2]):
            if start == end:
                # Row begins with whitespace: no text before the first gap.
                continue
            text = row[start:end].encode('ascii', 'ignore')
            if not isinstance(text, str):
                # Python 3: encode() produced bytes; return native str as
                # Python 2 does.
                text = text.decode('ascii')
            columns.append((start, text))
        return columns
class UnknownColumn(Exception):
    """A row holds a column offset the current ColumnCollector has not seen.

    Raised by ``ColumnCollector.merge_column``; ``PDFRecordFinder.find_fields``
    catches it and stops producing fields.
    """
class IsNextField(Exception):
    """A row handed to ``ColumnCollector.add`` belongs to the next field.

    Raised by ``ColumnCollector.add`` when the collector already holds data
    and ``is_next_field`` is true for the new row.
    """
class ColumnCollector(object):
    """Accumulate the text rows that together describe one table field.

    Rows are sequences of ``(column_offset, text)`` pairs.  The first row
    added defines the known columns; later rows append their text to those
    columns.  ``self.data`` maps column offset to collected text and is
    ``None`` until the first row arrives.
    """

    def __init__(self, initial=None):
        """Create an empty collector.

        :param initial: accepted for interface compatibility; not used.
        """
        self.data = None

    def add(self, data):
        """Add one row of ``(column_offset, text)`` pairs.

        The first row is stored as-is.  Every later row is merged column by
        column through :meth:`merge_column`.

        :raises IsNextField: if the collector already has data and the row
            starts in the collector's left-most column.
        :raises UnknownColumn: if the row uses an offset not present in the
            first row.  Columns merged before the offending one stay merged.
        """
        if not self.data:
            self.data = dict(data)
            return

        if self.is_next_field(data):
            raise IsNextField()

        for col_id, value in data:
            self.merge_column(col_id, value)

    def merge_column(self, col_id, value):
        """Append ``value`` (stripped, after a single space) to column ``col_id``.

        :raises UnknownColumn: if ``col_id`` is not a known column offset.
        """
        if col_id not in self.data:
            # try adding a wiggle room value?
            raise UnknownColumn(col_id)
        self.data[col_id] += ' ' + value.strip()

    def is_next_field(self, data):
        """
        Report whether ``data`` looks like the first row of the next field.

        That is the case when the collector already holds data and the
        left-most column of ``data`` sits at the same offset as the
        collector's own left-most column.  An empty collector or an empty
        row always gives ``False``.
        """
        if not self.data:
            return False

        incoming = dict(data)
        if not incoming:
            return False

        # Compare the smallest offsets explicitly: taking "the first key" of
        # a dict is arbitrary on Python 2 and unsupported on Python 3.
        return min(self.data) == min(incoming)

    @property
    def tuple(self):
        """Collected column texts ordered by column offset; ``()`` if empty."""
        if not self.data:
            return ()
        return tuple(self.data[key] for key in sorted(self.data))