Merge branch 'master' of brimstone.klowner.com:pyaccuwage

Conflicts:
	pyaccuwage/pdfextract.py
This commit is contained in:
Binh 2012-12-04 14:57:20 -06:00
commit 8995f142e5
3 changed files with 205 additions and 65 deletions

View file

@ -120,6 +120,8 @@ class NumericToken(BaseToken):
class RecordBuilder(object):
import fields
entry_max_length = 4
TOKEN_TYPES = [
RangeToken,
NumericToken,
@ -176,7 +178,10 @@ class RecordBuilder(object):
def _compile(self, entries):
for entry in entries:
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry))
if len(entry) > self.entry_max_length:
continue
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(self.entry_max_length-len(entry))
try:
f_length = int(f_length)
@ -186,9 +191,11 @@ class RecordBuilder(object):
try:
assert f_length == RangeToken(f_range).value
except AssertionError:
import pdb
pdb.set_trace()
except AssertionError, e:
continue
except ValueError, e:
# bad result, skip
continue
name_parts = f_name.split(' ')
@ -201,11 +208,13 @@ class RecordBuilder(object):
required = None
f_name = u'_'.join(map(lambda x:x.lower(), name_parts))
f_name = f_name.replace('&', 'and')
f_name = re.sub(r'[^\w]','', f_name)
yield {
'name': f_name,
'desc': '(' + f_range + '). ' + f_desc,
'range': f_range,
'desc': f_desc,
'length': f_length,
'required': required,
}
@ -261,9 +270,7 @@ class RecordBuilder(object):
add("(" + ", ".join(args) + ")")
yield "".join(result)
yield "".join(result).ljust(85) + "# %s" % entry['range']
class PastedDefParser(RecordBuilder):

View file

@ -53,17 +53,8 @@ class PDFRecordFinder(object):
for (i, row) in enumerate(self.textrows):
match = self.heading_exp.match(row)
if match:
#print i,match.groups()
results.append((i, ''.join(match.groups())))
"""
results2 = []
for r in results:
if len(results2)==0 or results2[-1:][0][1] != r[1]:
results2.append(r)
results = results2
"""
merged = []
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
merged.append( (a[0], b[0]-1, a[1]) )
@ -84,6 +75,29 @@ class PDFRecordFinder(object):
def find_fields(self, row_iter):
    """Group text rows into ColumnCollector objects, one per detected field.

    Each item of ``row_iter`` is decoded as UTF-8 and passed to
    ``self.extract_columns_from_row``.  Rows that yield no columns are
    reported to the current collector via ``empty_row()``; other rows are
    added to it.  When the collector raises ``IsNextField`` it is yielded
    and the row is added to a fresh collector instead.

    Yields:
        ColumnCollector instances, in the order they are completed.  The
        collector still being filled when ``row_iter`` is exhausted is
        yielded last.

    Iteration ends early, without yielding the in-progress collector,
    when a row raises ``UnknownColumn``.
    """
    cc = ColumnCollector()

    for r in row_iter:
        row = self.extract_columns_from_row(r.decode('UTF-8'))

        if not row:
            cc.empty_row()
            continue

        try:
            cc.add(row)
        except IsNextField:
            # The row belongs to the next field: hand back the finished
            # collector and start a new one with this row.
            yield cc
            cc = ColumnCollector()
            cc.add(row)
        except UnknownColumn:
            # A plain return ends the generator.  Raising StopIteration
            # here would surface as RuntimeError under PEP 479.
            return

    yield cc
def find_fields_old(self, row_iter):
cc = ColumnCollector()
for r in row_iter:
row = self.extract_columns_from_row(r.decode('UTF-8'))
@ -151,9 +165,47 @@ class IsNextField(Exception):
class ColumnCollector(object):
def __init__(self, initial=None):
self.data = None
self.column_widths = None
self.max_data_length = 0
self.adjust_pad = 3
self.empty_rows = 0
pass
def __repr__(self):
return "<%s: %s>" % (self.__class__.__name__, map(lambda x:x if len(x) < 25 else x[:25] + '..', self.data.values()))
# Merge one row, given as (column position, text) pairs, into this collector.
# Raises IsNextField when empty_row() has been called more than twice on this
# collector, or when is_next_field() returns true for the adjusted row.
# NOTE(review): this listing has lost its indentation; the nesting described
# in the comments below is inferred from statement order -- confirm against
# the real file.
def add(self, data):
if self.empty_rows > 2:
raise IsNextField()
# No data collected yet: the row's pairs become the initial column data.
if not self.data:
self.data = dict(data)
else:
# Later rows are passed through adjust_columns() before being checked
# and merged column by column.
data = self.adjust_columns(data)
if self.is_next_field(data):
raise IsNextField()
for col_id, value in data:
self.merge_column(col_id, value)
# Presumably runs for both branches, since adjust_columns() reads
# self.column_widths on later rows -- TODO confirm.
self.update_column_widths(data)
def empty_row(self):
self.empty_rows += 1
def update_column_widths(self, data):
self.last_data_length = len(data)
self.max_data_length = max(self.max_data_length, len(data))
if not self.column_widths:
self.column_widths = dict(map(lambda (column, value): [column, column + len(value)], data))
else:
for col_id, value in data:
try:
self.column_widths[col_id] = max(self.column_widths[col_id], col_id + len(value.strip()))
except KeyError:
pass
def add_old(self, data):
if not self.data:
self.data = dict(data)
else:
@ -162,10 +214,28 @@ class ColumnCollector(object):
for col_id, value in data:
self.merge_column(col_id, value)
def adjust_columns(self, data):
adjusted_data = {}
for col_id, value in data:
if col_id in self.data.keys():
adjusted_data[col_id] = value.strip()
else:
for col_start, col_end in self.column_widths.items():
if (col_start - self.adjust_pad) <= col_id and (col_end + self.adjust_pad) >= col_id:
if col_start in adjusted_data:
adjusted_data[col_start] += ' ' + value.strip()
else:
adjusted_data[col_start] = value.strip()
return adjusted_data.items()
def merge_column(self, col_id, value):
if col_id in self.data.keys():
self.data[col_id] += ' ' + value.strip()
else:
# try adding a wiggle room value?
# FIXME:
@ -175,6 +245,12 @@ class ColumnCollector(object):
# after the maximum column, and assume it's part of the
# max column?
"""
for col_start, col_end in self.column_widths.items():
if col_start <= col_id and (col_end) >= col_id:
self.data[col_start] += ' ' + value.strip()
return
"""
raise UnknownColumn
def is_next_field(self, data):
@ -185,13 +261,35 @@ class ColumnCollector(object):
the next field. Raise an exception and continue
on with a fresh ColumnCollector.
"""
first_key = dict(data).keys()[0]
if self.data:
return self.data.keys()[0] == first_key
""" If the length of the value in column_id is less than the position of the next column_id,
then this is probably a continuation.
"""
if self.data and data:
keys = dict(self.column_widths).keys()
keys.sort()
keys += [None]
if self.last_data_length < len(data):
return True
first_key, first_value = dict(data).items()[0]
if self.data.keys()[0] == first_key:
position = keys.index(first_key)
max_length = keys[position + 1]
if max_length:
return len(first_value) > max_length or len(data) == self.max_data_length
return False
@property
def tuple(self):
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
try:
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
except:
import pdb
pdb.set_trace()

View file

@ -32,11 +32,46 @@ doc = PDFRecordFinder(source_file)
records = doc.records()
builder = RecordBuilder()
for rec in records:
def record_begins_at(field):
    """Return the start position of the first field's range as an int.

    Despite its singular name, ``field`` is the sequence of field
    collectors for one record.  The first value of the first collector's
    ``data`` mapping is split on ``'-'`` and its leading piece is parsed
    as a base-10 integer.

    The body previously read the module-level name ``fields`` instead of
    its own argument, so it only worked when such a global happened to
    exist.
    """
    return int(field[0].data.values()[0].split('-')[0], 10)
sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))
def record_ends_at(fields):
    """Return the end position of the last field's range as an int.

    The first value of the last collector's ``data`` mapping is split on
    ``'-'`` and its trailing piece is parsed as a base-10 integer.
    """
    final_field = fields[-1]
    range_text = final_field.data.values()[0]
    end_text = range_text.split('-')[-1]
    return int(end_text, 10)
last_record_begins_at = -1
last_record_ends_at = -1
for rec in records:
#if not rec[1]:
# continue # no actual fields detected
fields = rec[1]
# strip out fields that are not 4 items long
fields = filter(lambda x:len(x.tuple) == 4, fields)
# strip fields that don't begin at position 0
fields = filter(lambda x: 0 in x.data, fields)
# strip fields that don't have a length-range type item in position 0
fields = filter(lambda x: re.match('^\d+[-]?\d*$', x.data[0]), fields)
if not fields:
continue
begins_at = record_begins_at(fields)
ends_at = record_ends_at(fields)
# FIXME record_ends_at is randomly exploding due to record data being
# a lump of text and not necessarily a field entry. I assume
# this is cleaned out by the record builder class.
print last_record_ends_at + 1, begins_at
#if last_record_ends_at + 1 != begins_at:
sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0]))
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
sys.stdout.write('\t' + field + '\n')
#print field
last_record_ends_at = ends_at