Record merging seems to work now that header offsets have been corrected.
There is still an issue parsing p1220 at line 2570. Having the parser ignore full-width lines might fix the problem, provided there is some way to measure the length of a row while counting only single-spaced words.
This commit is contained in:
parent
6e4a975cfb
commit
e6e087ef38
2 changed files with 7 additions and 10 deletions
|
@ -22,7 +22,6 @@ class PDFRecordFinder(object):
|
|||
self.field_heading_exp = field_heading_exp
|
||||
|
||||
def records(self):
|
||||
#headings = self.locate_heading_rows()
|
||||
headings = self.locate_heading_rows_by_field()
|
||||
|
||||
for (start, end, name) in headings:
|
||||
|
@ -55,7 +54,7 @@ class PDFRecordFinder(object):
|
|||
position -= 1
|
||||
|
||||
name = ''.join(header).strip().decode('ascii','ignore')
|
||||
results.append((i, name))
|
||||
results.append((i, name, position))
|
||||
else:
|
||||
# See if this row forces us to break from field reading.
|
||||
if re.search('Record\ Layout', row):
|
||||
|
@ -65,7 +64,7 @@ class PDFRecordFinder(object):
|
|||
for (a, b) in zip(results, results[1:] + [(len(self.textrows), None)]):
|
||||
end_pos = None
|
||||
|
||||
print a[0], record_break[0], b[0]-1
|
||||
#print a[0], record_break[0], b[0]-1
|
||||
|
||||
while record_break and record_break[0] < a[0]:
|
||||
record_break = record_break[1:]
|
||||
|
@ -76,7 +75,7 @@ class PDFRecordFinder(object):
|
|||
else:
|
||||
end_pos = b[0]-1
|
||||
|
||||
merged.append( (a[0], end_pos, a[1]) )
|
||||
merged.append( (a[0], end_pos-1, a[1]) )
|
||||
return merged
|
||||
|
||||
def locate_heading_rows(self):
|
||||
|
@ -110,7 +109,6 @@ class PDFRecordFinder(object):
|
|||
|
||||
for r in row_iter:
|
||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
||||
|
||||
if not row:
|
||||
cc.empty_row()
|
||||
continue
|
||||
|
@ -249,7 +247,6 @@ class ColumnCollector(object):
|
|||
def adjust_columns(self, data):
|
||||
adjusted_data = {}
|
||||
|
||||
|
||||
for col_id, value in data:
|
||||
if col_id in self.data.keys():
|
||||
adjusted_data[col_id] = value.strip()
|
||||
|
|
|
@ -65,11 +65,11 @@ for rec in records:
|
|||
# a lump of text and not necessarily a field entry. I assume
|
||||
# this is cleaned out by the record builder class.
|
||||
|
||||
print last_record_ends_at + 1, begins_at
|
||||
#if last_record_ends_at + 1 != begins_at:
|
||||
sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0]))
|
||||
#print last_record_ends_at + 1, begins_at
|
||||
if last_record_ends_at + 1 != begins_at:
|
||||
sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0].split(':')[-1]))
|
||||
|
||||
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
||||
for field in builder.load(map(lambda x:x.tuple, rec[1][0:])):
|
||||
sys.stdout.write('\t' + field + '\n')
|
||||
#print field
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue