Added ColumnCollector; fixed column parsing by scanning each row for runs of whitespace before splitting it into columns
This commit is contained in:
parent
fecd14db59
commit
6b5eb30f34
1 changed files with 127 additions and 16 deletions
|
@ -7,6 +7,7 @@ import pdb
|
|||
|
||||
""" pdftotext -layout -nopgbrk p1220.pdf - """
|
||||
|
||||
|
||||
class PDFRecordFinder(object):
|
||||
def __init__(self, src, heading_exp=None):
|
||||
if not heading_exp:
|
||||
|
@ -17,20 +18,48 @@ class PDFRecordFinder(object):
|
|||
self.textrows = pdftext.split('\n')
|
||||
self.heading_exp = heading_exp
|
||||
|
||||
@property
|
||||
def rows(self):
|
||||
for row in self.textrows:
|
||||
yield row
|
||||
|
||||
"""
|
||||
@property
|
||||
def records(self):
|
||||
row_iter = self.rows
|
||||
for r in row_iter:
|
||||
if self.heading_exp.match(r):
|
||||
record = self.extract_record(row_iter)
|
||||
yield record
|
||||
row_iter = iter(self.textrows)
|
||||
try:
|
||||
while 1:
|
||||
row = row_iter.next()
|
||||
if self.heading_exp.match(row):
|
||||
record = self.extract_record(row_iter)
|
||||
yield record
|
||||
except Exception, e:
|
||||
raise e
|
||||
#for r in self.textrows:
|
||||
# if self.heading_exp.match(r):
|
||||
# record = self.extract_record()
|
||||
# yield record
|
||||
"""
|
||||
|
||||
def extract_record(self, row_iter):
|
||||
|
||||
def columns(self):
    """Yield one ``ColumnCollector`` per logical table row.

    Every parsed row from every group produced by ``record_grouping()``
    is fed into the current collector.  When the collector reports that a
    row starts the next record (``is_next_record``), the filled collector
    is yielded and a fresh one takes over.

    Yields:
        ColumnCollector: a collector holding the merged column text.

    Raises:
        UnknownColumn: propagated from ``ColumnCollector.add`` when a
            continuation row uses a column key the collector has not seen.
    """
    collector = ColumnCollector()
    for group in self.record_grouping():
        for row in group:
            if collector.is_next_record(row):
                yield collector
                collector = ColumnCollector()
            collector.add(row)
    # Flush the trailing collector; without this the last collected row
    # would never reach the caller.
    if collector.data:
        yield collector
|
||||
|
||||
|
||||
|
||||
def record_grouping(self):
|
||||
row_iter = iter(self.textrows)
|
||||
i = 0
|
||||
for row in row_iter:
|
||||
i += 1
|
||||
if self.heading_exp.match(row):
|
||||
yield self.extract_record_columns(row_iter)
|
||||
|
||||
|
||||
def extract_record_columns(self, row_iter):
|
||||
re_multiwhite = re.compile(r'\s{2,}')
|
||||
result = []
|
||||
full_width_text_count = 0
|
||||
|
@ -40,6 +69,9 @@ class PDFRecordFinder(object):
|
|||
if not row:
|
||||
continue
|
||||
|
||||
#if row.strip().startswith('Code'):
|
||||
# pdb.set_trace()
|
||||
|
||||
# IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE
|
||||
if not re_multiwhite.search(row):
|
||||
full_width_text_count += 1
|
||||
|
@ -48,7 +80,39 @@ class PDFRecordFinder(object):
|
|||
# return result
|
||||
continue
|
||||
|
||||
#pdb.set_trace()
|
||||
white_ranges = [0,]
|
||||
pos = 0
|
||||
match = True
|
||||
while pos < len(row):
|
||||
match = re_multiwhite.search(row[pos:])
|
||||
if match:
|
||||
white_ranges.append(pos + match.start())
|
||||
white_ranges.append(pos + match.end())
|
||||
pos += match.end()
|
||||
else:
|
||||
white_ranges.append(len(row))
|
||||
pos = len(row)
|
||||
|
||||
|
||||
row_result = []
|
||||
white_iter = iter(white_ranges)
|
||||
while white_iter:
|
||||
try:
|
||||
start = white_iter.next()
|
||||
end = white_iter.next()
|
||||
if start != end:
|
||||
row_result.append(
|
||||
(start, row[start:end])
|
||||
)
|
||||
|
||||
except StopIteration:
|
||||
white_iter = None
|
||||
print white_ranges
|
||||
yield row_result
|
||||
result.append(row_result)
|
||||
|
||||
|
||||
"""
|
||||
row_result = []
|
||||
pos = 0
|
||||
while pos < len(row):
|
||||
|
@ -61,9 +125,56 @@ class PDFRecordFinder(object):
|
|||
else:
|
||||
if match:
|
||||
pos += match.end()
|
||||
row_result.append((pos,row[pos:],'b'))
|
||||
row_result.append((pos,row[pos:]))
|
||||
pos += len(row)
|
||||
|
||||
result.append(row_result)
|
||||
row_result = []
|
||||
return result
|
||||
"""
|
||||
#return result
|
||||
|
||||
|
||||
class UnknownColumn(Exception):
    """Raised when a value is merged into a column key that is not known."""
|
||||
|
||||
class IsNextRecord(Exception):
    """Raised when a row added to a collector starts the next record."""
|
||||
|
||||
class ColumnCollector(object):
    """Accumulate the column fragments that make up one logical table row.

    A row is a sequence of ``(col_id, text)`` pairs.  The first row added
    defines the known columns; later rows are treated as continuation
    lines and their text is appended to the matching column.
    """

    def __init__(self, initial=None):
        # ``initial`` is accepted for interface compatibility; it is not
        # used by this class.
        # Maps col_id -> accumulated text; None until the first add().
        self.data = None

    def add(self, data):
        """Store the first row, or merge a continuation row.

        Args:
            data: iterable of ``(col_id, text)`` pairs.

        Raises:
            IsNextRecord: if data is already held and ``data`` starts in
                the same leftmost column as the stored row.
            UnknownColumn: if a continuation cell uses an unseen col_id.
        """
        if not self.data:
            self.data = dict(data)
            return
        if self.is_next_record(data):
            raise IsNextRecord()
        for col_id, value in data:
            self.merge_column(col_id, value)

    def merge_column(self, col_id, value):
        """Append stripped ``value`` to column ``col_id``.

        Raises:
            UnknownColumn: if ``col_id`` is not one of the stored columns.
        """
        if col_id not in self.data:
            # try adding a wiggle room value?
            raise UnknownColumn(col_id)
        self.data[col_id] += ' ' + value.strip()

    def is_next_record(self, data):
        """Return True when ``data`` looks like the start of a new record.

        If this collector already holds data and the leftmost column of
        ``data`` is the same as the leftmost stored column, the row is
        probably the beginning of the next field rather than a
        continuation line.  An empty collector or an empty row never
        counts as a new record.
        """
        if not self.data:
            return False
        incoming = dict(data)
        if not incoming:
            return False
        # Compare the smallest keys explicitly: dict ordering must not
        # decide which column counts as "first".
        return min(self.data) == min(incoming)
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue