Record detection seems to be working much better. We currently have
an issue where full-page-width blocks are being interpreted as a
single large column, and subsequent field-definition columns are
then being merged into it as subcolumns.

The current problematic line in p1220 is 1598.

Maybe add some functionality that lets us specify the number of
columns we're most interested in? Or perhaps automatically discard
1-column ColumnCollectors?
This commit is contained in:
Binh 2012-11-06 15:34:35 -06:00
parent 46755dd90d
commit fe4bd20bad

View file

@ -31,10 +31,8 @@ class PDFRecordFinder(object):
for (i, row) in enumerate(self.textrows): for (i, row) in enumerate(self.textrows):
match = self.heading_exp.match(row) match = self.heading_exp.match(row)
if match: if match:
print i,match.groups()
#if not ''.join(match.groups()).lower().endswith('(continued)'):
results.append((i, ''.join(match.groups()))) results.append((i, ''.join(match.groups())))
""" """
results2 = [] results2 = []
for r in results: for r in results:
@ -52,22 +50,49 @@ class PDFRecordFinder(object):
def find_fields(self, row_iter): def find_fields(self, row_iter):
cc = ColumnCollector() cc = ColumnCollector()
for r in row_iter: for r in row_iter:
row = self.extract_columns_from_row(r.decode('UTF-8')) row = self.extract_columns_from_row(r.decode('UTF-8'))
if not row: if not row:
continue continue
if cc.is_next_field(row):
if row[1][1] == 'Vendor Indicator': #if cc.is_next_field(row):
import pdb # print len(cc.data)
pdb.set_trace() # yield cc
yield cc # cc = ColumnCollector()
cc = ColumnCollector()
try: try:
cc.add(row) cc.add(row)
except IsNextField, e:
yield cc
cc = ColumnCollector()
cc.add(row)
except UnknownColumn, e:
raise StopIteration
yield cc
def find_fields_old(self, row_iter):
cc = ColumnCollector()
for r in row_iter:
row = self.extract_columns_from_row(r.decode('UTF-8'))
if not row:
continue
if cc.is_next_field(row):
#if row[1][1] == 'Vendor Indicator':
# import pdb
# pdb.set_trace()
yield cc
cc = ColumnCollector()
try:
cc.add(row)
except UnknownColumn, e: except UnknownColumn, e:
raise StopIteration raise StopIteration
yield cc yield cc
@ -77,8 +102,8 @@ class PDFRecordFinder(object):
re_multiwhite = re.compile(r'\s{2,}') re_multiwhite = re.compile(r'\s{2,}')
# IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE # IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE
if not re_multiwhite.search(row): #if not re_multiwhite.search(row):
return None # return None
white_ranges = [0,] white_ranges = [0,]
pos = 0 pos = 0
@ -102,13 +127,13 @@ class PDFRecordFinder(object):
row_result.append( row_result.append(
(start, row[start:end].encode('ascii','ignore')) (start, row[start:end].encode('ascii','ignore'))
) )
except StopIteration: except StopIteration:
white_iter = None white_iter = None
return row_result return row_result
class UnknownColumn(Exception): class UnknownColumn(Exception):
pass pass
@ -118,9 +143,36 @@ class IsNextField(Exception):
class ColumnCollector(object): class ColumnCollector(object):
def __init__(self, initial=None): def __init__(self, initial=None):
self.data = None self.data = None
self.column_widths = None
self.max_data_length = 0
pass pass
def add(self, data): def add(self, data):
if not self.data:
self.data = dict(data)
else:
data = self.adjust_columns(data)
if self.is_next_field(data):
raise IsNextField()
for col_id, value in data:
self.merge_column(col_id, value)
self.update_column_widths(data)
def update_column_widths(self, data):
self.last_data_length = len(data)
self.max_data_length = max(self.max_data_length, len(data))
if not self.column_widths:
self.column_widths = dict(map(lambda (column, value): [column, column + len(value)], data))
else:
for col_id, value in data:
try:
self.column_widths[col_id] = max(self.column_widths[col_id], col_id + len(value.strip()))
except KeyError:
pass
def add_old(self, data):
if not self.data: if not self.data:
self.data = dict(data) self.data = dict(data)
else: else:
@ -128,12 +180,28 @@ class ColumnCollector(object):
raise IsNextField() raise IsNextField()
for col_id, value in data: for col_id, value in data:
self.merge_column(col_id, value) self.merge_column(col_id, value)
def adjust_columns(self, data):
adjusted_data = {}
for col_id, value in data:
if col_id in self.data.keys():
adjusted_data[col_id] = value.strip()
else:
for col_start, col_end in self.column_widths.items():
if col_start <= col_id and (col_end) >= col_id:
if col_start in adjusted_data:
adjusted_data[col_start] += ' ' + value.strip()
else:
adjusted_data[col_start] = value.strip()
return adjusted_data.items()
def merge_column(self, col_id, value): def merge_column(self, col_id, value):
if col_id in self.data.keys(): if col_id in self.data.keys():
self.data[col_id] += ' ' + value.strip() self.data[col_id] += ' ' + value.strip()
else:
else:
# try adding a wiggle room value? # try adding a wiggle room value?
# FIXME: # FIXME:
# Sometimes description columns contain column-like # Sometimes description columns contain column-like
@ -142,8 +210,14 @@ class ColumnCollector(object):
# after the maximum column, and assume it's part of the # after the maximum column, and assume it's part of the
# max column? # max column?
"""
for col_start, col_end in self.column_widths.items():
if col_start <= col_id and (col_end) >= col_id:
self.data[col_start] += ' ' + value.strip()
return
"""
raise UnknownColumn raise UnknownColumn
def is_next_field(self, data): def is_next_field(self, data):
""" """
If the first key value contains a string If the first key value contains a string
@ -152,9 +226,37 @@ class ColumnCollector(object):
the next field. Raise an exception and continue the next field. Raise an exception and continue
on with a fresh ColumnCollector. on with a fresh ColumnCollector.
""" """
first_key = dict(data).keys()[0]
""" If the length of the value in column_id is less than the position of the next column_id,
then this is probably a continuation.
"""
if self.data: if self.data:
return self.data.keys()[0] == first_key keys = dict(self.column_widths).keys()
keys.sort()
keys += [None]
if self.last_data_length < len(data):
return True
first_key, first_value = dict(data).items()[0]
if self.data.keys()[0] == first_key:
position = keys.index(first_key)
max_length = keys[position + 1]
print 'test', len(first_value), max_length
if max_length:
return len(first_value) > max_length or len(data) == self.max_data_length
return False
#for key, nextkey in map(lambda x:(keys[x], keys[x+1]), range(len(keys)-1)):
# print 'key', key, nextkey
first_key, first_value = dict(data).items()[0]
if self.data:
#print self.data.keys()[0], first_key, first_value, self.column_widths
return self.data.keys()[0] == first_key # and len(first_value) > self.column_widths[first_key]
return False return False
@property @property