We need to remove some of the yield statements because they make iteration
very confusing to keep track of: global iterators are being passed around and iterated over in chunks. I've added a locate_heading_rows method, which scans the entire document for the row numbers of lines that look like record definition headings. I think we can use these row-number spans to feed into the row columnizer stuff.
This commit is contained in:
parent
6b5eb30f34
commit
b77b80e485
1 changed file with 116 additions and 45 deletions
|
@ -17,36 +17,23 @@ class PDFRecordFinder(object):
|
||||||
pdftext = subprocess.check_output(opts)
|
pdftext = subprocess.check_output(opts)
|
||||||
self.textrows = pdftext.split('\n')
|
self.textrows = pdftext.split('\n')
|
||||||
self.heading_exp = heading_exp
|
self.heading_exp = heading_exp
|
||||||
|
|
||||||
"""
|
|
||||||
@property
|
|
||||||
def records(self):
|
|
||||||
row_iter = iter(self.textrows)
|
|
||||||
try:
|
|
||||||
while 1:
|
|
||||||
row = row_iter.next()
|
|
||||||
if self.heading_exp.match(row):
|
|
||||||
record = self.extract_record(row_iter)
|
|
||||||
yield record
|
|
||||||
except Exception, e:
|
|
||||||
raise e
|
|
||||||
#for r in self.textrows:
|
|
||||||
# if self.heading_exp.match(r):
|
|
||||||
# record = self.extract_record()
|
|
||||||
# yield record
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
def columns(self):
|
def columns(self):
|
||||||
results = []
|
results = []
|
||||||
cc = ColumnCollector()
|
cc = ColumnCollector()
|
||||||
for group in self.record_grouping():
|
for heading, group in self.record_grouping():
|
||||||
|
print "HEADING", heading
|
||||||
for row in group:
|
for row in group:
|
||||||
if cc.is_next_record(row):
|
if cc.is_next_field(row):
|
||||||
yield cc
|
yield cc
|
||||||
cc = ColumnCollector()
|
cc = ColumnCollector()
|
||||||
print row
|
#print row
|
||||||
cc.add(row)
|
try:
|
||||||
|
cc.add(row)
|
||||||
|
except UnknownColumn, e:
|
||||||
|
results.append(cc)
|
||||||
|
cc = ColumnCollector()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -55,31 +42,114 @@ class PDFRecordFinder(object):
|
||||||
i = 0
|
i = 0
|
||||||
for row in row_iter:
|
for row in row_iter:
|
||||||
i += 1
|
i += 1
|
||||||
if self.heading_exp.match(row):
|
match = self.heading_exp.match(row)
|
||||||
yield self.extract_record_columns(row_iter)
|
if match:
|
||||||
|
yield (match.groups(), self.extract_record_columns(row_iter))
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
def extract_record_columns(self, row_iter):
|
def locate_heading_rows(self):
|
||||||
re_multiwhite = re.compile(r'\s{2,}')
|
results = []
|
||||||
result = []
|
for (i, row) in enumerate(self.textrows):
|
||||||
full_width_text_count = 0
|
match = self.heading_exp.match(row)
|
||||||
|
if match:
|
||||||
|
if not ''.join(match.groups()).lower().endswith('(continued)'):
|
||||||
|
results.append((i, ''.join(match.groups())))
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def records2(self):
|
||||||
|
row_iter = iter(self.textrows)
|
||||||
|
record = True
|
||||||
|
while record:
|
||||||
|
record = self.extract_record(row_iter)
|
||||||
|
yield record
|
||||||
|
|
||||||
|
|
||||||
|
def extract_record(self, row_iter):
|
||||||
|
heading = self.find_heading(row_iter)
|
||||||
|
fields = self.find_fields(row_iter)
|
||||||
|
return heading, list(fields)
|
||||||
|
|
||||||
|
|
||||||
|
def find_heading(self, row_iter):
|
||||||
|
for row in row_iter:
|
||||||
|
heading_match = self.heading_exp.match(row)
|
||||||
|
if heading_match:
|
||||||
|
return heading_match.groups()
|
||||||
|
|
||||||
|
|
||||||
|
def find_fields(self, row_iter):
|
||||||
|
cc = ColumnCollector()
|
||||||
for r in row_iter:
|
for r in row_iter:
|
||||||
row = r.decode('UTF-8')
|
row = self.extract_columns_from_row(r)
|
||||||
if not row:
|
if not row:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if cc.is_next_field(row):
|
||||||
|
yield cc
|
||||||
|
cc = ColumnCollector()
|
||||||
|
try:
|
||||||
|
cc.add(row)
|
||||||
|
except UnknownColumn, e:
|
||||||
|
print 'UNKNOWN COLUMN', row
|
||||||
|
raise StopIteration
|
||||||
|
|
||||||
#if row.strip().startswith('Code'):
|
|
||||||
# pdb.set_trace()
|
|
||||||
|
|
||||||
|
def extract_columns_from_row(self, row):
|
||||||
|
re_multiwhite = re.compile(r'\s{2,}')
|
||||||
|
|
||||||
|
# IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE
|
||||||
|
if not re_multiwhite.search(row):
|
||||||
|
return None
|
||||||
|
|
||||||
|
white_ranges = [0,]
|
||||||
|
pos = 0
|
||||||
|
while pos < len(row):
|
||||||
|
match = re_multiwhite.search(row[pos:])
|
||||||
|
if match:
|
||||||
|
white_ranges.append(pos + match.start())
|
||||||
|
white_ranges.append(pos + match.end())
|
||||||
|
pos += match.end()
|
||||||
|
else:
|
||||||
|
white_ranges.append(len(row))
|
||||||
|
pos = len(row)
|
||||||
|
|
||||||
|
row_result = []
|
||||||
|
white_iter = iter(white_ranges)
|
||||||
|
while white_iter:
|
||||||
|
try:
|
||||||
|
start = white_iter.next()
|
||||||
|
end = white_iter.next()
|
||||||
|
if start != end:
|
||||||
|
row_result.append(
|
||||||
|
(start, row[start:end])
|
||||||
|
)
|
||||||
|
|
||||||
|
except StopIteration:
|
||||||
|
white_iter = None
|
||||||
|
|
||||||
|
return row_result
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def extract_row_columns(self, row_iter):
|
||||||
|
re_multiwhite = re.compile(r'\s{2,}')
|
||||||
|
full_width_text_count = 0
|
||||||
|
|
||||||
|
#for r in row_iter:
|
||||||
|
row = None
|
||||||
|
while not row:
|
||||||
|
row = row_iter.next()
|
||||||
|
row = row.decode('UTF-8')
|
||||||
|
|
||||||
# IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE
|
# IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE
|
||||||
if not re_multiwhite.search(row):
|
if not re_multiwhite.search(row):
|
||||||
full_width_text_count += 1
|
full_width_text_count += 1
|
||||||
#if full_width_text_count > 2:
|
row = None
|
||||||
# print 'full width text count exceeded limit'
|
|
||||||
# return result
|
if True:
|
||||||
continue
|
|
||||||
|
|
||||||
white_ranges = [0,]
|
white_ranges = [0,]
|
||||||
pos = 0
|
pos = 0
|
||||||
match = True
|
match = True
|
||||||
|
@ -107,9 +177,11 @@ class PDFRecordFinder(object):
|
||||||
|
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
white_iter = None
|
white_iter = None
|
||||||
print white_ranges
|
|
||||||
yield row_result
|
#print white_ranges
|
||||||
result.append(row_result)
|
return row_result
|
||||||
|
#yield row_result
|
||||||
|
#result.append(row_result)
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
@ -136,7 +208,7 @@ class PDFRecordFinder(object):
|
||||||
class UnknownColumn(Exception):
|
class UnknownColumn(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class IsNextRecord(Exception):
|
class IsNextField(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class ColumnCollector(object):
|
class ColumnCollector(object):
|
||||||
|
@ -148,8 +220,8 @@ class ColumnCollector(object):
|
||||||
if not self.data:
|
if not self.data:
|
||||||
self.data = dict(data)
|
self.data = dict(data)
|
||||||
else:
|
else:
|
||||||
if self.is_next_record(data):
|
if self.is_next_field(data):
|
||||||
raise IsNextRecord()
|
raise IsNextField()
|
||||||
for col_id, value in data:
|
for col_id, value in data:
|
||||||
self.merge_column(col_id, value)
|
self.merge_column(col_id, value)
|
||||||
|
|
||||||
|
@ -161,7 +233,7 @@ class ColumnCollector(object):
|
||||||
# try adding a wiggle room value?
|
# try adding a wiggle room value?
|
||||||
raise UnknownColumn
|
raise UnknownColumn
|
||||||
|
|
||||||
def is_next_record(self, data):
|
def is_next_field(self, data):
|
||||||
"""
|
"""
|
||||||
If the first key value contains a string
|
If the first key value contains a string
|
||||||
and we already have some data in the record,
|
and we already have some data in the record,
|
||||||
|
@ -171,7 +243,6 @@ class ColumnCollector(object):
|
||||||
"""
|
"""
|
||||||
first_key = dict(data).keys()[0]
|
first_key = dict(data).keys()[0]
|
||||||
if self.data:
|
if self.data:
|
||||||
print self.data.keys()[0], first_key
|
|
||||||
return self.data.keys()[0] == first_key
|
return self.data.keys()[0] == first_key
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue