trying new header location method

This commit is contained in:
Binh 2012-12-04 14:54:10 -06:00
parent 46755dd90d
commit 6e1d02db8d

View file

@ -11,30 +11,51 @@ import pdb
class PDFRecordFinder(object): class PDFRecordFinder(object):
def __init__(self, src, heading_exp=None):
    """Extract the text of a PDF and prepare the row-matching patterns.

    Runs ``pdftotext -layout -nopgbrk -eol unix`` on *src*, splits its
    output on newlines and keeps the rows in ``self.textrows``.

    Args:
        src: path of the PDF file handed to ``pdftotext``.
        heading_exp: optional compiled regex used to recognise record
            heading rows. When omitted, a default is built that accepts
            either a ``Record Name: <name>`` row or any row containing
            ``Record Layout``.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
    """
    if not heading_exp:
        # A single capture group that always participates in the match:
        # callers join ``match.groups()``, which would fail on the ``None``
        # left behind by an unmatched alternative.  The lookahead lets a
        # "... Record Layout ..." row be captured whole, while a
        # "Record Name: X" row yields just "X".
        heading_exp = re.compile(
            r'\s+(?:Record Name: |(?=.*Record Layout))(.*)')

    # "Descrition" was the spelling originally searched for; the optional
    # "p" keeps that working and also matches the usual "Description".
    field_heading_exp = re.compile(r'^Field.*Field.*Length.*Descrip?tion')

    opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
    pdftext = subprocess.check_output(opts)
    self.textrows = pdftext.split('\n')
    self.heading_exp = heading_exp
    self.field_heading_exp = field_heading_exp
def records(self):
    """Yield one ``(name, fields)`` pair per record found in the text rows.

    Record boundaries come from ``locate_heading_rows_by_field``; each is a
    ``(start, end, name)`` triple.  The rows strictly after ``start`` and
    before ``end`` are passed to ``find_fields`` and its results are
    collected into a list.  The name is decoded as ASCII, dropping any
    bytes that cannot be decoded.
    """
    for (first_row, stop_row, raw_name) in self.locate_heading_rows_by_field():
        record_name = raw_name.decode('ascii', 'ignore')
        body_rows = iter(self.textrows[first_row + 1:stop_row])
        record_fields = list(self.find_fields(body_rows))
        yield (record_name, record_fields)
def locate_heading_rows_by_field(self):
    """Locate records by their column-heading row and the title above it.

    Every row matching ``self.field_heading_exp`` marks the start of a
    record's field table.  The record title is recovered by walking upwards
    from that row: the row directly above sets a reference indentation, and
    the walk stops at the first row indented more deeply than the
    reference.  The walk never goes above the previous field-heading row
    or the first row of the text, so it always terminates.

    Returns:
        A list of ``(start, end, name)`` tuples, in row order, where
        ``start`` is the index of the field-heading row, ``end`` is the
        index of the first title row of the following record (or
        ``len(self.textrows)`` for the last record) and ``name`` is the
        title rows stripped, with blank rows dropped, joined by single
        spaces.  This is the shape ``records`` unpacks.
    """
    indent_exp = re.compile(r'^(\s*)')

    def indent_width(text):
        # Compare widths, not match objects: match objects have no
        # meaningful ordering.
        return len(indent_exp.search(text).group(1))

    rows = self.textrows
    located = []  # (title_start, field_heading_row, name)
    floor = 0     # lowest row index the backward walk may reach
    for (i, row) in enumerate(rows):
        if not self.field_heading_exp.match(row):
            continue

        position = i
        if i > floor:
            # work backwards until we think the header is fully copied
            position = i - 1
            reference = indent_width(rows[position])
            while position > floor:
                position -= 1
                if indent_width(rows[position]) > reference:
                    break

        title_rows = [r.strip() for r in rows[position:i]]
        name = ' '.join(r for r in title_rows if r)
        located.append((position, i, name))
        floor = i + 1

    results = []
    for (n, (_title_start, heading_row, name)) in enumerate(located):
        if n + 1 < len(located):
            end = located[n + 1][0]
        else:
            end = len(rows)
        results.append((heading_row, end, name))
    return results
def locate_heading_rows(self): def locate_heading_rows(self):
results = [] results = []
for (i, row) in enumerate(self.textrows): for (i, row) in enumerate(self.textrows):
match = self.heading_exp.match(row) match = self.heading_exp.match(row)
if match: if match:
print i,match.groups() #print i,match.groups()
#if not ''.join(match.groups()).lower().endswith('(continued)'):
results.append((i, ''.join(match.groups()))) results.append((i, ''.join(match.groups())))
""" """
results2 = [] results2 = []
for r in results: for r in results:
@ -49,25 +70,36 @@ class PDFRecordFinder(object):
return merged return merged
def locate_layout_block_rows(self):
    """Find the rows that contain the text "Record Layout".

    These rows are not fields we are interested in: they introduce the
    crazy blocks of field definitions rather than the nice 4-column tables
    we are looking for.

    Returns:
        A list of indexes into ``self.textrows``, in ascending order, of
        every row containing "Record Layout" anywhere in the row.
    """
    # ``search`` rather than ``match``: the text may be preceded by
    # indentation or other words, and ``match`` only looks at the row start.
    layout_exp = re.compile("Record Layout")
    return [i for (i, row) in enumerate(self.textrows)
            if layout_exp.search(row)]
def find_fields(self, row_iter):
    """Group text rows into one ``ColumnCollector`` per field.

    Each row is decoded as UTF-8 and split into columns with
    ``extract_columns_from_row``; rows that yield no columns are skipped.
    When the current collector reports that a row starts the next field,
    the collector is yielded and a fresh one takes over before the row is
    added.

    Args:
        row_iter: iterable of encoded text rows belonging to one record.

    Yields:
        ``ColumnCollector`` instances, one per field, in row order.  The
        collector still being filled is yielded when the rows run out.
        If a row cannot be added because of ``UnknownColumn``, iteration
        stops at once and the collector being filled is not yielded.
    """
    cc = ColumnCollector()
    for r in row_iter:
        row = self.extract_columns_from_row(r.decode('UTF-8'))
        if not row:
            continue

        if cc.is_next_field(row):
            yield cc
            cc = ColumnCollector()

        try:
            cc.add(row)
        except UnknownColumn:
            # End the generator with ``return``: raising StopIteration
            # inside a generator is turned into RuntimeError by PEP 479.
            return

    yield cc
@ -102,13 +134,14 @@ class PDFRecordFinder(object):
row_result.append( row_result.append(
(start, row[start:end].encode('ascii','ignore')) (start, row[start:end].encode('ascii','ignore'))
) )
except StopIteration: except StopIteration:
white_iter = None white_iter = None
#print row_result
return row_result return row_result
class UnknownColumn(Exception):
    """Signals a column value that cannot be merged into a known column."""
@ -128,12 +161,12 @@ class ColumnCollector(object):
raise IsNextField() raise IsNextField()
for col_id, value in data: for col_id, value in data:
self.merge_column(col_id, value) self.merge_column(col_id, value)
def merge_column(self, col_id, value): def merge_column(self, col_id, value):
if col_id in self.data.keys(): if col_id in self.data.keys():
self.data[col_id] += ' ' + value.strip() self.data[col_id] += ' ' + value.strip()
else: else:
# try adding a wiggle room value? # try adding a wiggle room value?
# FIXME: # FIXME:
# Sometimes description columns contain column-like # Sometimes description columns contain column-like
@ -143,7 +176,7 @@ class ColumnCollector(object):
# max column? # max column?
raise UnknownColumn raise UnknownColumn
def is_next_field(self, data): def is_next_field(self, data):
""" """
If the first key value contains a string If the first key value contains a string