Almost have things working. It seems like some of the record results
are overlapping. I'm assuming this is due to a missing `continue` or something similar inside the ColumnCollector. I added a couple of new IsNextField exceptions in response to blank rows, but this may be causing more problems than expected. The next step is probably to check the records returned and verify that nothing is being duplicated. Some of the duplicates may be filtered out by the RecordBuilder class, or during the fields filtering in the pyaccuwage-pdfparse script (see: fields).
This commit is contained in:
parent
1c7533973a
commit
31ff97db8a
2 changed files with 45 additions and 14 deletions
|
@ -42,18 +42,15 @@ class PDFRecordFinder(object):
|
||||||
|
|
||||||
def find_fields(self, row_iter):
|
def find_fields(self, row_iter):
|
||||||
cc = ColumnCollector()
|
cc = ColumnCollector()
|
||||||
|
blank_row_counter = 0
|
||||||
|
|
||||||
for r in row_iter:
|
for r in row_iter:
|
||||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
||||||
|
|
||||||
if not row:
|
if not row:
|
||||||
|
cc.empty_row()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
#if cc.is_next_field(row):
|
|
||||||
# print len(cc.data)
|
|
||||||
# yield cc
|
|
||||||
# cc = ColumnCollector()
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
cc.add(row)
|
cc.add(row)
|
||||||
except IsNextField, e:
|
except IsNextField, e:
|
||||||
|
@ -137,9 +134,13 @@ class ColumnCollector(object):
|
||||||
self.column_widths = None
|
self.column_widths = None
|
||||||
self.max_data_length = 0
|
self.max_data_length = 0
|
||||||
self.adjust_pad = 3
|
self.adjust_pad = 3
|
||||||
|
self.empty_rows = 0
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def add(self, data):
|
def add(self, data):
|
||||||
|
if self.empty_rows > 2:
|
||||||
|
raise IsNextField()
|
||||||
|
|
||||||
if not self.data:
|
if not self.data:
|
||||||
self.data = dict(data)
|
self.data = dict(data)
|
||||||
else:
|
else:
|
||||||
|
@ -151,6 +152,9 @@ class ColumnCollector(object):
|
||||||
|
|
||||||
self.update_column_widths(data)
|
self.update_column_widths(data)
|
||||||
|
|
||||||
|
def empty_row(self):
|
||||||
|
self.empty_rows += 1
|
||||||
|
|
||||||
def update_column_widths(self, data):
|
def update_column_widths(self, data):
|
||||||
self.last_data_length = len(data)
|
self.last_data_length = len(data)
|
||||||
self.max_data_length = max(self.max_data_length, len(data))
|
self.max_data_length = max(self.max_data_length, len(data))
|
||||||
|
@ -177,6 +181,7 @@ class ColumnCollector(object):
|
||||||
def adjust_columns(self, data):
|
def adjust_columns(self, data):
|
||||||
adjusted_data = {}
|
adjusted_data = {}
|
||||||
|
|
||||||
|
|
||||||
for col_id, value in data:
|
for col_id, value in data:
|
||||||
if col_id in self.data.keys():
|
if col_id in self.data.keys():
|
||||||
adjusted_data[col_id] = value.strip()
|
adjusted_data[col_id] = value.strip()
|
||||||
|
@ -245,6 +250,9 @@ class ColumnCollector(object):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tuple(self):
|
def tuple(self):
|
||||||
|
try:
|
||||||
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
||||||
|
except:
|
||||||
|
import pdb
|
||||||
|
pdb.set_trace()
|
||||||
|
|
||||||
|
|
|
@ -32,23 +32,46 @@ doc = PDFRecordFinder(source_file)
|
||||||
records = doc.records()
|
records = doc.records()
|
||||||
builder = RecordBuilder()
|
builder = RecordBuilder()
|
||||||
|
|
||||||
def record_begins_at(record):
|
def record_begins_at(field):
|
||||||
return int(record[1][1].data.values()[0].split('-')[0], 10)
|
return int(fields[0].data.values()[0].split('-')[0], 10)
|
||||||
|
|
||||||
def record_ends_at(record):
|
def record_ends_at(fields):
|
||||||
return record[1][-1].data
|
return int(fields[-1].data.values()[0].split('-')[-1], 10)
|
||||||
return int(record[1][-1].data.values()[0].split('-')[-1], 10)
|
|
||||||
|
last_record_begins_at = -1
|
||||||
|
last_record_ends_at = -1
|
||||||
|
|
||||||
for rec in records:
|
for rec in records:
|
||||||
|
#if not rec[1]:
|
||||||
|
# continue # no actual fields detected
|
||||||
|
fields = rec[1]
|
||||||
|
|
||||||
|
# strip out fields that are not 4 items long
|
||||||
|
fields = filter(lambda x:len(x.tuple) == 4, fields)
|
||||||
|
|
||||||
|
# strip fields that don't begin at position 0
|
||||||
|
fields = filter(lambda x: 0 in x.data, fields)
|
||||||
|
|
||||||
|
# strip fields that don't have a length-range type item in position 0
|
||||||
|
fields = filter(lambda x: re.match('^\d+[-]?\d*$', x.data[0]), fields)
|
||||||
|
|
||||||
|
if not fields:
|
||||||
|
continue
|
||||||
|
|
||||||
|
begins_at = record_begins_at(fields)
|
||||||
|
ends_at = record_ends_at(fields)
|
||||||
|
|
||||||
print record_begins_at(rec) #, 'to', record_ends_at(rec)
|
|
||||||
# FIXME record_ends_at is randomly exploding due to record data being
|
# FIXME record_ends_at is randomly exploding due to record data being
|
||||||
# a lump of text and not necessarily a field entry. I assume
|
# a lump of text and not necessarily a field entry. I assume
|
||||||
# this is cleaned out by the record builder class.
|
# this is cleaned out by the record builder class.
|
||||||
|
|
||||||
sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))
|
print last_record_ends_at + 1, begins_at
|
||||||
|
#if last_record_ends_at + 1 != begins_at:
|
||||||
|
sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0]))
|
||||||
|
|
||||||
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
||||||
sys.stdout.write('\t' + field + '\n')
|
sys.stdout.write('\t' + field + '\n')
|
||||||
#print field
|
#print field
|
||||||
|
|
||||||
|
last_record_ends_at = ends_at
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue