Record merging seems to work now that header offsets have been corrected.
There's still an issue parsing p1220 on line 2570. Making the parser ignore full-width lines might fix the problem, provided there is some way to measure the length of a row while counting only single-spaced words.
This commit is contained in:
parent
6e4a975cfb
commit
e6e087ef38
2 changed files with 7 additions and 10 deletions
|
@ -22,7 +22,6 @@ class PDFRecordFinder(object):
|
||||||
self.field_heading_exp = field_heading_exp
|
self.field_heading_exp = field_heading_exp
|
||||||
|
|
||||||
def records(self):
|
def records(self):
|
||||||
#headings = self.locate_heading_rows()
|
|
||||||
headings = self.locate_heading_rows_by_field()
|
headings = self.locate_heading_rows_by_field()
|
||||||
|
|
||||||
for (start, end, name) in headings:
|
for (start, end, name) in headings:
|
||||||
|
@ -55,7 +54,7 @@ class PDFRecordFinder(object):
|
||||||
position -= 1
|
position -= 1
|
||||||
|
|
||||||
name = ''.join(header).strip().decode('ascii','ignore')
|
name = ''.join(header).strip().decode('ascii','ignore')
|
||||||
results.append((i, name))
|
results.append((i, name, position))
|
||||||
else:
|
else:
|
||||||
# See if this row forces us to break from field reading.
|
# See if this row forces us to break from field reading.
|
||||||
if re.search('Record\ Layout', row):
|
if re.search('Record\ Layout', row):
|
||||||
|
@ -65,7 +64,7 @@ class PDFRecordFinder(object):
|
||||||
for (a, b) in zip(results, results[1:] + [(len(self.textrows), None)]):
|
for (a, b) in zip(results, results[1:] + [(len(self.textrows), None)]):
|
||||||
end_pos = None
|
end_pos = None
|
||||||
|
|
||||||
print a[0], record_break[0], b[0]-1
|
#print a[0], record_break[0], b[0]-1
|
||||||
|
|
||||||
while record_break and record_break[0] < a[0]:
|
while record_break and record_break[0] < a[0]:
|
||||||
record_break = record_break[1:]
|
record_break = record_break[1:]
|
||||||
|
@ -76,7 +75,7 @@ class PDFRecordFinder(object):
|
||||||
else:
|
else:
|
||||||
end_pos = b[0]-1
|
end_pos = b[0]-1
|
||||||
|
|
||||||
merged.append( (a[0], end_pos, a[1]) )
|
merged.append( (a[0], end_pos-1, a[1]) )
|
||||||
return merged
|
return merged
|
||||||
|
|
||||||
def locate_heading_rows(self):
|
def locate_heading_rows(self):
|
||||||
|
@ -110,7 +109,6 @@ class PDFRecordFinder(object):
|
||||||
|
|
||||||
for r in row_iter:
|
for r in row_iter:
|
||||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
||||||
|
|
||||||
if not row:
|
if not row:
|
||||||
cc.empty_row()
|
cc.empty_row()
|
||||||
continue
|
continue
|
||||||
|
@ -249,7 +247,6 @@ class ColumnCollector(object):
|
||||||
def adjust_columns(self, data):
|
def adjust_columns(self, data):
|
||||||
adjusted_data = {}
|
adjusted_data = {}
|
||||||
|
|
||||||
|
|
||||||
for col_id, value in data:
|
for col_id, value in data:
|
||||||
if col_id in self.data.keys():
|
if col_id in self.data.keys():
|
||||||
adjusted_data[col_id] = value.strip()
|
adjusted_data[col_id] = value.strip()
|
||||||
|
|
|
@ -65,11 +65,11 @@ for rec in records:
|
||||||
# a lump of text and not necessarily a field entry. I assume
|
# a lump of text and not necessarily a field entry. I assume
|
||||||
# this is cleaned out by the record builder class.
|
# this is cleaned out by the record builder class.
|
||||||
|
|
||||||
print last_record_ends_at + 1, begins_at
|
#print last_record_ends_at + 1, begins_at
|
||||||
#if last_record_ends_at + 1 != begins_at:
|
if last_record_ends_at + 1 != begins_at:
|
||||||
sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0]))
|
sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0].split(':')[-1]))
|
||||||
|
|
||||||
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
for field in builder.load(map(lambda x:x.tuple, rec[1][0:])):
|
||||||
sys.stdout.write('\t' + field + '\n')
|
sys.stdout.write('\t' + field + '\n')
|
||||||
#print field
|
#print field
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue