Almost have things working. It seems like some of the record results
are overlapping. I'm assuming this is due to a missing `continue`
or something similar inside the ColumnCollector. I added a couple of new
IsNextRecord exceptions in response to blank rows, but this may be causing
more problems than expected. The next step is probably to check the records
returned and verify that nothing is being duplicated. Some of the duplicates
may be filtered out by the RecordBuilder class, or during the field filtering
in the pyaccuwage-pdfparse script (see: fields).
This commit is contained in:
Binh 2012-11-20 16:05:36 -06:00
parent 1c7533973a
commit 31ff97db8a
2 changed files with 45 additions and 14 deletions

View file

@ -32,23 +32,46 @@ doc = PDFRecordFinder(source_file)
records = doc.records()
builder = RecordBuilder()

# A position-range cell: digits, an optional dash, then optional digits
# (e.g. "12" or "12-34"). Compiled once instead of on every loop iteration.
_POSITION_RANGE_RE = re.compile(r'^\d+[-]?\d*$')


def _first_data_value(field):
    """Return the first value stored in ``field.data``.

    ``next(iter(...))`` is used rather than ``.values()[0]`` so the lookup
    also works where ``dict.values()`` returns a non-indexable view.

    NOTE(review): presumably this is the "start-end" position cell held
    under key 0 -- confirm against the field class.
    """
    return next(iter(field.data.values()))


def record_begins_at(fields):
    """Return the starting position of a record as an int.

    The value is the part before the first '-' in the first data value of
    the first entry of ``fields``. ``fields`` must be non-empty.
    """
    return int(_first_data_value(fields[0]).split('-')[0], 10)


def record_ends_at(fields):
    """Return the ending position of a record as an int.

    The value is the part after the last '-' in the first data value of
    the last entry of ``fields``. ``fields`` must be non-empty.
    """
    return int(_first_data_value(fields[-1]).split('-')[-1], 10)


last_record_begins_at = -1
last_record_ends_at = -1

for rec in records:
    # Keep only entries that look like real field rows: a 4-item tuple,
    # something stored under key 0, and that value shaped like a position
    # range. Built as a list so the emptiness test and indexing below work
    # regardless of whether ``filter`` would have returned an iterator.
    fields = [
        f for f in rec[1]
        if len(f.tuple) == 4
        and 0 in f.data
        and _POSITION_RANGE_RE.match(f.data[0])
    ]

    if not fields:
        continue  # no usable fields detected for this record

    begins_at = record_begins_at(fields)
    ends_at = record_ends_at(fields)

    # FIXME record_ends_at can blow up when a record's data is a lump of
    #       text rather than a field entry. The filtering above is meant to
    #       strip those; the record builder class may also clean them out.

    # Debug output: this record's start, then the expected start (previous
    # end + 1) next to the actual start, to spot overlapping records.
    # NOTE(review): this goes to stdout together with the generated class
    # source below -- confirm that is intended.
    sys.stdout.write('%d\n' % begins_at)
    sys.stdout.write('%d %d\n' % (last_record_ends_at + 1, begins_at))
    # TODO: act on last_record_ends_at + 1 != begins_at (gap or overlap).

    sys.stdout.write("\nclass %s(object):\n" % re.sub(r'[^\w]', '', rec[0]))

    # NOTE(review): the builder is fed the unfiltered rows (minus the
    # first one), not the filtered ``fields`` list -- confirm intended.
    for field in builder.load([f.tuple for f in rec[1][1:]]):
        sys.stdout.write('\t' + field + '\n')

    last_record_ends_at = ends_at