Record merging seems to work now that header offsets have been corrected.

There's an issue parsing p1220 on line 2570. Having the parser skip
full-width lines might fix the problem. Is there some way to check
the length of a row while counting only single-spaced words?
This commit is contained in:
Binh 2013-01-29 15:48:32 -06:00
parent 6e4a975cfb
commit e6e087ef38
2 changed files with 7 additions and 10 deletions

View file

@ -22,7 +22,6 @@ class PDFRecordFinder(object):
self.field_heading_exp = field_heading_exp
def records(self):
#headings = self.locate_heading_rows()
headings = self.locate_heading_rows_by_field()
for (start, end, name) in headings:
@ -55,7 +54,7 @@ class PDFRecordFinder(object):
position -= 1
name = ''.join(header).strip().decode('ascii','ignore')
results.append((i, name))
results.append((i, name, position))
else:
# See if this row forces us to break from field reading.
if re.search('Record\ Layout', row):
@ -65,7 +64,7 @@ class PDFRecordFinder(object):
for (a, b) in zip(results, results[1:] + [(len(self.textrows), None)]):
end_pos = None
print a[0], record_break[0], b[0]-1
#print a[0], record_break[0], b[0]-1
while record_break and record_break[0] < a[0]:
record_break = record_break[1:]
@ -76,7 +75,7 @@ class PDFRecordFinder(object):
else:
end_pos = b[0]-1
merged.append( (a[0], end_pos, a[1]) )
merged.append( (a[0], end_pos-1, a[1]) )
return merged
def locate_heading_rows(self):
@ -110,7 +109,6 @@ class PDFRecordFinder(object):
for r in row_iter:
row = self.extract_columns_from_row(r.decode('UTF-8'))
if not row:
cc.empty_row()
continue
@ -249,7 +247,6 @@ class ColumnCollector(object):
def adjust_columns(self, data):
adjusted_data = {}
for col_id, value in data:
if col_id in self.data.keys():
adjusted_data[col_id] = value.strip()

View file

@ -65,11 +65,11 @@ for rec in records:
# a lump of text and not necessarily a field entry. I assume
# this is cleaned out by the record builder class.
print last_record_ends_at + 1, begins_at
#if last_record_ends_at + 1 != begins_at:
sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0]))
#print last_record_ends_at + 1, begins_at
if last_record_ends_at + 1 != begins_at:
sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0].split(':')[-1]))
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
for field in builder.load(map(lambda x:x.tuple, rec[1][0:])):
sys.stdout.write('\t' + field + '\n')
#print field