Parsing all the way through the PDF appears to work. Next we need
to track the beginning/ending points for each record and append
continuation records onto the previous one. There's some issue in
the pyaccuwage-pdfparse script causing it to have problems reading
the last record field in a record group. Maybe the record extractor
needs to dump the last failed ColumnCollector rather than return it
if it's determined to hold junk data?

The record builder seems to handle everything just fine.

Added a function to the field-name parsing that replaces ampersands
with the string "and" so they don't cause problems in variable names.
This commit is contained in:
Binh 2012-11-13 15:53:41 -06:00
parent fe4bd20bad
commit 1c7533973a
3 changed files with 74 additions and 71 deletions

View file

@ -120,6 +120,8 @@ class NumericToken(BaseToken):
class RecordBuilder(object): class RecordBuilder(object):
import fields import fields
entry_max_length = 4
TOKEN_TYPES = [ TOKEN_TYPES = [
RangeToken, RangeToken,
NumericToken, NumericToken,
@ -176,7 +178,10 @@ class RecordBuilder(object):
def _compile(self, entries): def _compile(self, entries):
for entry in entries: for entry in entries:
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry)) if len(entry) > self.entry_max_length:
continue
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(self.entry_max_length-len(entry))
try: try:
f_length = int(f_length) f_length = int(f_length)
@ -186,9 +191,11 @@ class RecordBuilder(object):
try: try:
assert f_length == RangeToken(f_range).value assert f_length == RangeToken(f_range).value
except AssertionError: except AssertionError, e:
import pdb continue
pdb.set_trace() except ValueError, e:
# bad result, skip
continue
name_parts = f_name.split(' ') name_parts = f_name.split(' ')
@ -201,11 +208,13 @@ class RecordBuilder(object):
required = None required = None
f_name = u'_'.join(map(lambda x:x.lower(), name_parts)) f_name = u'_'.join(map(lambda x:x.lower(), name_parts))
f_name = f_name.replace('&', 'and')
f_name = re.sub(r'[^\w]','', f_name) f_name = re.sub(r'[^\w]','', f_name)
yield { yield {
'name': f_name, 'name': f_name,
'desc': '(' + f_range + '). ' + f_desc, 'range': f_range,
'desc': f_desc,
'length': f_length, 'length': f_length,
'required': required, 'required': required,
} }
@ -261,9 +270,7 @@ class RecordBuilder(object):
add("(" + ", ".join(args) + ")") add("(" + ", ".join(args) + ")")
yield "".join(result).ljust(85) + "# %s" % entry['range']
yield "".join(result)
class PastedDefParser(RecordBuilder): class PastedDefParser(RecordBuilder):

View file

@ -33,14 +33,6 @@ class PDFRecordFinder(object):
if match: if match:
results.append((i, ''.join(match.groups()))) results.append((i, ''.join(match.groups())))
"""
results2 = []
for r in results:
if len(results2)==0 or results2[-1:][0][1] != r[1]:
results2.append(r)
results = results2
"""
merged = [] merged = []
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]): for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
merged.append( (a[0], b[0]-1, a[1]) ) merged.append( (a[0], b[0]-1, a[1]) )
@ -57,7 +49,6 @@ class PDFRecordFinder(object):
if not row: if not row:
continue continue
#if cc.is_next_field(row): #if cc.is_next_field(row):
# print len(cc.data) # print len(cc.data)
# yield cc # yield cc
@ -102,8 +93,8 @@ class PDFRecordFinder(object):
re_multiwhite = re.compile(r'\s{2,}') re_multiwhite = re.compile(r'\s{2,}')
# IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE # IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE
#if not re_multiwhite.search(row): if not re_multiwhite.search(row):
# return None return None
white_ranges = [0,] white_ranges = [0,]
pos = 0 pos = 0
@ -145,6 +136,7 @@ class ColumnCollector(object):
self.data = None self.data = None
self.column_widths = None self.column_widths = None
self.max_data_length = 0 self.max_data_length = 0
self.adjust_pad = 3
pass pass
def add(self, data): def add(self, data):
@ -190,11 +182,12 @@ class ColumnCollector(object):
adjusted_data[col_id] = value.strip() adjusted_data[col_id] = value.strip()
else: else:
for col_start, col_end in self.column_widths.items(): for col_start, col_end in self.column_widths.items():
if col_start <= col_id and (col_end) >= col_id: if (col_start - self.adjust_pad) <= col_id and (col_end + self.adjust_pad) >= col_id:
if col_start in adjusted_data: if col_start in adjusted_data:
adjusted_data[col_start] += ' ' + value.strip() adjusted_data[col_start] += ' ' + value.strip()
else: else:
adjusted_data[col_start] = value.strip() adjusted_data[col_start] = value.strip()
return adjusted_data.items() return adjusted_data.items()
@ -231,7 +224,7 @@ class ColumnCollector(object):
then this is probably a continuation. then this is probably a continuation.
""" """
if self.data: if self.data and data:
keys = dict(self.column_widths).keys() keys = dict(self.column_widths).keys()
keys.sort() keys.sort()
keys += [None] keys += [None]
@ -244,20 +237,11 @@ class ColumnCollector(object):
position = keys.index(first_key) position = keys.index(first_key)
max_length = keys[position + 1] max_length = keys[position + 1]
print 'test', len(first_value), max_length
if max_length: if max_length:
return len(first_value) > max_length or len(data) == self.max_data_length return len(first_value) > max_length or len(data) == self.max_data_length
return False return False
#for key, nextkey in map(lambda x:(keys[x], keys[x+1]), range(len(keys)-1)):
# print 'key', key, nextkey
first_key, first_value = dict(data).items()[0]
if self.data:
#print self.data.keys()[0], first_key, first_value, self.column_widths
return self.data.keys()[0] == first_key # and len(first_value) > self.column_widths[first_key]
return False
@property @property
def tuple(self): def tuple(self):

View file

@ -32,8 +32,20 @@ doc = PDFRecordFinder(source_file)
records = doc.records() records = doc.records()
builder = RecordBuilder() builder = RecordBuilder()
def record_begins_at(record):
return int(record[1][1].data.values()[0].split('-')[0], 10)
def record_ends_at(record):
return record[1][-1].data
return int(record[1][-1].data.values()[0].split('-')[-1], 10)
for rec in records: for rec in records:
print record_begins_at(rec) #, 'to', record_ends_at(rec)
# FIXME record_ends_at is randomly exploding due to record data being
# a lump of text and not necessarily a field entry. I assume
# this is cleaned out by the record builder class.
sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0])) sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])): for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):