We need to remove some of the yield statements because they are making iteration

very confusing to keep track of, due to global iterators being passed around
and iterated over in chunks.

I've added a locate_heading_rows method, which scans the entire document
for row numbers that look like record definition headings. I think we
can use these row-number spans to feed into the row columnizer stuff.
This commit is contained in:
Binh 2012-06-30 15:21:05 -05:00
parent 6b5eb30f34
commit b77b80e485

View file

@ -19,34 +19,21 @@ class PDFRecordFinder(object):
self.heading_exp = heading_exp
"""
@property
def records(self):
row_iter = iter(self.textrows)
try:
while 1:
row = row_iter.next()
if self.heading_exp.match(row):
record = self.extract_record(row_iter)
yield record
except Exception, e:
raise e
#for r in self.textrows:
# if self.heading_exp.match(r):
# record = self.extract_record()
# yield record
"""
def columns(self):
results = []
cc = ColumnCollector()
for group in self.record_grouping():
for heading, group in self.record_grouping():
print "HEADING", heading
for row in group:
if cc.is_next_record(row):
if cc.is_next_field(row):
yield cc
cc = ColumnCollector()
print row
cc.add(row)
#print row
try:
cc.add(row)
except UnknownColumn, e:
results.append(cc)
cc = ColumnCollector()
@ -55,31 +42,114 @@ class PDFRecordFinder(object):
i = 0
for row in row_iter:
i += 1
if self.heading_exp.match(row):
yield self.extract_record_columns(row_iter)
match = self.heading_exp.match(row)
if match:
yield (match.groups(), self.extract_record_columns(row_iter))
"""
def extract_record_columns(self, row_iter):
re_multiwhite = re.compile(r'\s{2,}')
result = []
full_width_text_count = 0
def locate_heading_rows(self):
    """Return ``(row_index, heading_text)`` pairs for heading rows.

    Every row of ``self.textrows`` is tested against ``self.heading_exp``.
    The heading text is the concatenation of all groups captured by the
    match.  Headings whose text ends with ``(continued)`` (compared
    case-insensitively) are left out of the result.
    """
    located = []
    for index, text in enumerate(self.textrows):
        found = self.heading_exp.match(text)
        if not found:
            continue
        heading = ''.join(found.groups())
        if heading.lower().endswith('(continued)'):
            # Continuation of a heading already seen; not a new record.
            continue
        located.append((index, heading))
    return located
def records2(self):
    """Yield ``(heading, fields)`` records read from ``self.textrows``.

    A single iterator over ``self.textrows`` is handed to
    ``self.extract_record`` repeatedly, so each call continues where the
    previous one stopped.

    Iteration ends when ``extract_record`` reports a heading of ``None``,
    i.e. no further heading row was found.  The result of
    ``extract_record`` is a 2-tuple and therefore always truthy, so its
    truthiness cannot be used as the stop condition.
    """
    row_iter = iter(self.textrows)
    while True:
        record = self.extract_record(row_iter)
        if record[0] is None:
            # No heading left in the remaining rows: nothing more to emit.
            return
        yield record
def extract_record(self, row_iter):
    """Read one record from ``row_iter`` and return ``(heading, fields)``.

    ``heading`` is whatever ``self.find_heading`` returns for the iterator.
    ``fields`` is the list of items produced by ``self.find_fields`` from
    the rows that remain after the heading search.
    """
    # The heading search must run first: both helpers consume the same
    # iterator, so the order decides which rows each one sees.
    heading = self.find_heading(row_iter)
    collected = list(self.find_fields(row_iter))
    return (heading, collected)
def find_heading(self, row_iter):
    """Advance ``row_iter`` to the next heading row and return its groups.

    Rows are consumed one at a time until one matches
    ``self.heading_exp``; the groups captured by that match are returned
    as a tuple.  Rows after the matching one are left unconsumed.
    Returns ``None`` when the iterator runs out without a match.
    """
    # Lazy generator: a row is only pulled from row_iter when it is tested.
    attempts = (self.heading_exp.match(line) for line in row_iter)
    for hit in attempts:
        if hit:
            return hit.groups()
    return None
def find_fields(self, row_iter):
cc = ColumnCollector()
for r in row_iter:
row = r.decode('UTF-8')
row = self.extract_columns_from_row(r)
if not row:
continue
#if row.strip().startswith('Code'):
# pdb.set_trace()
if cc.is_next_field(row):
yield cc
cc = ColumnCollector()
try:
cc.add(row)
except UnknownColumn, e:
print 'UNKNOWN COLUMN', row
raise StopIteration
def extract_columns_from_row(self, row):
    """Split a text row into columns separated by runs of 2+ whitespace.

    Returns a list of ``(start_offset, text)`` tuples, one per column, in
    left-to-right order.  ``start_offset`` is the index in ``row`` where
    the column text begins.  Leading and trailing whitespace runs do not
    produce empty columns.

    Returns ``None`` when the row contains no run of two or more
    whitespace characters, since such a line is likely not a table row.
    """
    re_multiwhite = re.compile(r'\s{2,}')

    # IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE
    if not re_multiwhite.search(row):
        return None

    row_result = []
    start = 0
    # Walk the separators once; each column is the text lying between the
    # end of one whitespace run and the start of the next.
    for gap in re_multiwhite.finditer(row):
        if gap.start() != start:
            row_result.append((start, row[start:gap.start()]))
        start = gap.end()

    # Text after the last separator forms the final column.
    if start < len(row):
        row_result.append((start, row[start:]))
    return row_result
def extract_row_columns(self, row_iter):
re_multiwhite = re.compile(r'\s{2,}')
full_width_text_count = 0
#for r in row_iter:
row = None
while not row:
row = row_iter.next()
row = row.decode('UTF-8')
# IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE
if not re_multiwhite.search(row):
full_width_text_count += 1
#if full_width_text_count > 2:
# print 'full width text count exceeded limit'
# return result
continue
row = None
if True:
white_ranges = [0,]
pos = 0
match = True
@ -107,9 +177,11 @@ class PDFRecordFinder(object):
except StopIteration:
white_iter = None
print white_ranges
yield row_result
result.append(row_result)
#print white_ranges
return row_result
#yield row_result
#result.append(row_result)
"""
@ -136,7 +208,7 @@ class PDFRecordFinder(object):
class UnknownColumn(Exception):
pass
class IsNextRecord(Exception):
class IsNextField(Exception):
pass
class ColumnCollector(object):
@ -148,8 +220,8 @@ class ColumnCollector(object):
if not self.data:
self.data = dict(data)
else:
if self.is_next_record(data):
raise IsNextRecord()
if self.is_next_field(data):
raise IsNextField()
for col_id, value in data:
self.merge_column(col_id, value)
@ -161,7 +233,7 @@ class ColumnCollector(object):
# try adding a wiggle room value?
raise UnknownColumn
def is_next_record(self, data):
def is_next_field(self, data):
"""
If the first key value contains a string
and we already have some data in the record,
@ -171,7 +243,6 @@ class ColumnCollector(object):
"""
first_key = dict(data).keys()[0]
if self.data:
print self.data.keys()[0], first_key
return self.data.keys()[0] == first_key
return False