added ColumnCollector, fixed column parsing by scanning for whitespace before separating

This commit is contained in:
Binh 2012-06-26 15:55:18 -05:00
parent fecd14db59
commit 6b5eb30f34

View file

@ -7,6 +7,7 @@ import pdb
""" pdftotext -layout -nopgbrk p1220.pdf - """
class PDFRecordFinder(object):
def __init__(self, src, heading_exp=None):
if not heading_exp:
@ -16,21 +17,49 @@ class PDFRecordFinder(object):
pdftext = subprocess.check_output(opts)
self.textrows = pdftext.split('\n')
self.heading_exp = heading_exp
@property
def rows(self):
for row in self.textrows:
yield row
"""
@property
def records(self):
row_iter = self.rows
for r in row_iter:
if self.heading_exp.match(r):
record = self.extract_record(row_iter)
yield record
row_iter = iter(self.textrows)
try:
while 1:
row = row_iter.next()
if self.heading_exp.match(row):
record = self.extract_record(row_iter)
yield record
except Exception, e:
raise e
#for r in self.textrows:
# if self.heading_exp.match(r):
# record = self.extract_record()
# yield record
"""
def extract_record(self, row_iter):
def columns(self):
    """Yield one ``ColumnCollector`` per record found in the document.

    Rows produced by ``record_grouping()`` are fed into a collector until
    ``is_next_record`` reports that a row starts a new record; the filled
    collector is then yielded and a fresh one takes over.

    Yields:
        ColumnCollector: a collector holding the merged rows of one record.

    Raises:
        UnknownColumn: propagated from ``ColumnCollector.add`` when a row
            contains a column the current record has not seen.
    """
    cc = ColumnCollector()
    for group in self.record_grouping():
        for row in group:
            if not row:
                # A row without any cells has nothing to merge and no
                # first column to compare against; skip it.
                continue
            if cc.is_next_record(row):
                yield cc
                cc = ColumnCollector()
            cc.add(row)
    # The last record has no successor row to trigger the yield above, so
    # flush it here; otherwise it would be silently dropped.
    if cc.data:
        yield cc
def record_grouping(self):
row_iter = iter(self.textrows)
i = 0
for row in row_iter:
i += 1
if self.heading_exp.match(row):
yield self.extract_record_columns(row_iter)
def extract_record_columns(self, row_iter):
re_multiwhite = re.compile(r'\s{2,}')
result = []
full_width_text_count = 0
@ -40,6 +69,9 @@ class PDFRecordFinder(object):
if not row:
continue
#if row.strip().startswith('Code'):
# pdb.set_trace()
# IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE
if not re_multiwhite.search(row):
full_width_text_count += 1
@ -48,7 +80,39 @@ class PDFRecordFinder(object):
# return result
continue
#pdb.set_trace()
white_ranges = [0,]
pos = 0
match = True
while pos < len(row):
match = re_multiwhite.search(row[pos:])
if match:
white_ranges.append(pos + match.start())
white_ranges.append(pos + match.end())
pos += match.end()
else:
white_ranges.append(len(row))
pos = len(row)
row_result = []
white_iter = iter(white_ranges)
while white_iter:
try:
start = white_iter.next()
end = white_iter.next()
if start != end:
row_result.append(
(start, row[start:end])
)
except StopIteration:
white_iter = None
print white_ranges
yield row_result
result.append(row_result)
"""
row_result = []
pos = 0
while pos < len(row):
@ -61,9 +125,56 @@ class PDFRecordFinder(object):
else:
if match:
pos += match.end()
row_result.append((pos,row[pos:],'b'))
row_result.append((pos,row[pos:]))
pos += len(row)
result.append(row_result)
row_result = []
return result
"""
#return result
class UnknownColumn(Exception):
    """Raised by ``ColumnCollector.merge_column`` for a column id that the
    record being collected does not already contain."""
class IsNextRecord(Exception):
    """Raised by ``ColumnCollector.add`` when the row handed to it is
    detected as the start of the next record."""
class ColumnCollector(object):
    """Collect the rows of one record, merging wrapped text column by column.

    A row is a sequence of ``(col_id, text)`` pairs. The first row added
    defines the record's columns; later rows append their text to the
    matching column. A later row whose first column id equals the first
    column id of the record is treated as the start of the next record.
    """

    def __init__(self, initial=None):
        """Create an empty collector.

        Args:
            initial: accepted for interface compatibility; currently unused.
        """
        self.data = None
        # Column id of the first cell of the first row added. Remembered
        # explicitly because dict key order is not a reliable way to
        # recover which column came first.
        self._first_col_id = None

    def add(self, data):
        """Add a row to the record.

        Args:
            data: sequence of ``(col_id, text)`` pairs.

        Raises:
            IsNextRecord: the collector already holds data and ``data``
                starts a new record.
            UnknownColumn: ``data`` contains a column id that the record
                does not have.
        """
        if not self.data:
            self.data = dict(data)
            self._first_col_id = self._leading_col_id(data)
            return
        if self.is_next_record(data):
            raise IsNextRecord()
        for col_id, value in data:
            self.merge_column(col_id, value)

    def merge_column(self, col_id, value):
        """Append ``value`` (stripped, space-separated) to column ``col_id``.

        Raises:
            UnknownColumn: the record has no column ``col_id``.
        """
        if not self.data or col_id not in self.data:
            # try adding a wiggle room value?
            raise UnknownColumn(col_id)
        self.data[col_id] += ' ' + value.strip()

    def is_next_record(self, data):
        """Return True when ``data`` looks like the first row of a new record.

        A row counts as a new record when this collector already holds data
        and the row's first column id equals the first column id of the
        record being collected. An empty collector or an empty row always
        gives False. The caller is expected to start a fresh
        ``ColumnCollector`` when this returns True.
        """
        if not self.data:
            return False
        leading = self._leading_col_id(data)
        if leading is None:
            return False
        return leading == self._first_col_id

    @staticmethod
    def _leading_col_id(data):
        """Return the column id of the first pair in ``data``, or None if empty."""
        first_pair = next(iter(data), None)
        if first_pair is None:
            return None
        return first_pair[0]