added ColumnCollector, fixed column parsing by scanning for whitespace before separating
This commit is contained in:
parent
fecd14db59
commit
6b5eb30f34
1 changed files with 127 additions and 16 deletions
|
@ -7,6 +7,7 @@ import pdb
|
||||||
|
|
||||||
""" pdftotext -layout -nopgbrk p1220.pdf - """
|
""" pdftotext -layout -nopgbrk p1220.pdf - """
|
||||||
|
|
||||||
|
|
||||||
class PDFRecordFinder(object):
|
class PDFRecordFinder(object):
|
||||||
def __init__(self, src, heading_exp=None):
|
def __init__(self, src, heading_exp=None):
|
||||||
if not heading_exp:
|
if not heading_exp:
|
||||||
|
@ -17,20 +18,48 @@ class PDFRecordFinder(object):
|
||||||
self.textrows = pdftext.split('\n')
|
self.textrows = pdftext.split('\n')
|
||||||
self.heading_exp = heading_exp
|
self.heading_exp = heading_exp
|
||||||
|
|
||||||
@property
|
"""
|
||||||
def rows(self):
|
|
||||||
for row in self.textrows:
|
|
||||||
yield row
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def records(self):
|
def records(self):
|
||||||
row_iter = self.rows
|
row_iter = iter(self.textrows)
|
||||||
for r in row_iter:
|
try:
|
||||||
if self.heading_exp.match(r):
|
while 1:
|
||||||
|
row = row_iter.next()
|
||||||
|
if self.heading_exp.match(row):
|
||||||
record = self.extract_record(row_iter)
|
record = self.extract_record(row_iter)
|
||||||
yield record
|
yield record
|
||||||
|
except Exception, e:
|
||||||
|
raise e
|
||||||
|
#for r in self.textrows:
|
||||||
|
# if self.heading_exp.match(r):
|
||||||
|
# record = self.extract_record()
|
||||||
|
# yield record
|
||||||
|
"""
|
||||||
|
|
||||||
def extract_record(self, row_iter):
|
|
||||||
|
def columns(self):
    """
    Yield one ColumnCollector per logical record found in the text.

    Rows are taken from self.record_grouping(), which produces one group
    of rows per heading match. Every row is offered to the current
    collector; when the collector reports that the row starts a new
    record, the finished collector is yielded and a fresh one takes over.

    Exceptions raised by ColumnCollector.add() are not caught here and
    propagate to the caller.
    """
    cc = ColumnCollector()
    for group in self.record_grouping():
        for row in group:
            if cc.is_next_record(row):
                yield cc
                cc = ColumnCollector()
            cc.add(row)
    # Flush the record that was still being collected when the input
    # ran out; without this the last record is silently dropped.
    if cc.data:
        yield cc
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def record_grouping(self):
    """
    Yield one group of rows for every heading found in self.textrows.

    A single iterator over the text rows is shared with
    extract_record_columns(): when a row matches self.heading_exp the
    iterator is handed over, so extraction continues from the row that
    follows the heading. Rows consumed by the extractor are therefore
    never tested against heading_exp in this loop.
    """
    row_iter = iter(self.textrows)
    for row in row_iter:
        if self.heading_exp.match(row):
            # NOTE(review): extract_record_columns appears to be a
            # generator, in which case rows are only consumed while the
            # caller iterates the yielded group -- confirm callers
            # exhaust each group before asking for the next one.
            yield self.extract_record_columns(row_iter)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_record_columns(self, row_iter):
|
||||||
re_multiwhite = re.compile(r'\s{2,}')
|
re_multiwhite = re.compile(r'\s{2,}')
|
||||||
result = []
|
result = []
|
||||||
full_width_text_count = 0
|
full_width_text_count = 0
|
||||||
|
@ -40,6 +69,9 @@ class PDFRecordFinder(object):
|
||||||
if not row:
|
if not row:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
#if row.strip().startswith('Code'):
|
||||||
|
# pdb.set_trace()
|
||||||
|
|
||||||
# IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE
|
# IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE
|
||||||
if not re_multiwhite.search(row):
|
if not re_multiwhite.search(row):
|
||||||
full_width_text_count += 1
|
full_width_text_count += 1
|
||||||
|
@ -48,7 +80,39 @@ class PDFRecordFinder(object):
|
||||||
# return result
|
# return result
|
||||||
continue
|
continue
|
||||||
|
|
||||||
#pdb.set_trace()
|
white_ranges = [0,]
|
||||||
|
pos = 0
|
||||||
|
match = True
|
||||||
|
while pos < len(row):
|
||||||
|
match = re_multiwhite.search(row[pos:])
|
||||||
|
if match:
|
||||||
|
white_ranges.append(pos + match.start())
|
||||||
|
white_ranges.append(pos + match.end())
|
||||||
|
pos += match.end()
|
||||||
|
else:
|
||||||
|
white_ranges.append(len(row))
|
||||||
|
pos = len(row)
|
||||||
|
|
||||||
|
|
||||||
|
row_result = []
|
||||||
|
white_iter = iter(white_ranges)
|
||||||
|
while white_iter:
|
||||||
|
try:
|
||||||
|
start = white_iter.next()
|
||||||
|
end = white_iter.next()
|
||||||
|
if start != end:
|
||||||
|
row_result.append(
|
||||||
|
(start, row[start:end])
|
||||||
|
)
|
||||||
|
|
||||||
|
except StopIteration:
|
||||||
|
white_iter = None
|
||||||
|
print white_ranges
|
||||||
|
yield row_result
|
||||||
|
result.append(row_result)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
row_result = []
|
row_result = []
|
||||||
pos = 0
|
pos = 0
|
||||||
while pos < len(row):
|
while pos < len(row):
|
||||||
|
@ -61,9 +125,56 @@ class PDFRecordFinder(object):
|
||||||
else:
|
else:
|
||||||
if match:
|
if match:
|
||||||
pos += match.end()
|
pos += match.end()
|
||||||
row_result.append((pos,row[pos:],'b'))
|
row_result.append((pos,row[pos:]))
|
||||||
pos += len(row)
|
pos += len(row)
|
||||||
|
|
||||||
result.append(row_result)
|
result.append(row_result)
|
||||||
row_result = []
|
row_result = []
|
||||||
return result
|
"""
|
||||||
|
#return result
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownColumn(Exception):
    """Raised when a row uses a column offset the collector has not seen."""
    pass


class IsNextRecord(Exception):
    """Raised when a row that starts a new record is added to a collector."""
    pass


class ColumnCollector(object):
    """
    Accumulate the rows of one record, keyed by column offset.

    Each row is a sequence of (col_id, text) pairs. The first row added
    defines the set of known columns; text from later rows is appended
    to the matching column.
    """

    def __init__(self, initial=None):
        """
        Create an empty collector.

        initial -- optional first row of (col_id, text) pairs used to
                   seed the collector.
        """
        self.data = None
        if initial:
            self.add(initial)

    def add(self, data):
        """
        Add one row of (col_id, text) pairs to the record.

        The first row is stored as-is. Later rows are merged column by
        column.

        Raises IsNextRecord if the row starts a new record, and
        UnknownColumn if the row uses a column the first row lacked.
        """
        if not self.data:
            self.data = dict(data)
            return
        if self.is_next_record(data):
            raise IsNextRecord()
        for col_id, value in data:
            self.merge_column(col_id, value)

    def merge_column(self, col_id, value):
        """
        Append value (stripped, separated by one space) to column col_id.

        Raises UnknownColumn if col_id is not a column of this record.
        """
        if not self.data or col_id not in self.data:
            # try adding a wiggle room value?
            raise UnknownColumn(col_id)
        self.data[col_id] += ' ' + value.strip()

    def is_next_record(self, data):
        """
        Return True if the row looks like the start of the next record.

        That is the case when the collector already holds data and the
        row's leading column is the same as the leading column of the
        stored record. An empty row, or a collector without data, gives
        False.

        The leading column is taken to be the smallest col_id, so the
        result does not depend on dict ordering.
        """
        if not self.data or not data:
            return False
        leading_stored = min(self.data)
        leading_incoming = min(col_id for col_id, _ in data)
        return leading_stored == leading_incoming
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue