trying new header location method
This commit is contained in:
parent
46755dd90d
commit
6e1d02db8d
1 changed files with 53 additions and 20 deletions
|
@ -11,30 +11,51 @@ import pdb
|
|||
class PDFRecordFinder(object):
|
||||
def __init__(self, src, heading_exp=None):
    """Extract the text of a PDF and prepare the heading matchers.

    Runs ``pdftotext -layout -nopgbrk -eol unix`` on ``src`` and stores
    its output, split on newlines, in ``self.textrows``.

    src: path of the PDF file handed to ``pdftotext``.
    heading_exp: optional compiled pattern used by
        ``locate_heading_rows`` to recognise record heading rows. When
        omitted, a default is built that accepts both
        "Record Name: <name>" rows and rows containing "Record Layout".

    Raises subprocess.CalledProcessError when ``pdftotext`` exits with a
    non-zero status (behaviour of ``subprocess.check_output``).
    """
    if not heading_exp:
        # The previous default had unbalanced parentheses and could not be
        # compiled. This form keeps exactly ONE capturing group that always
        # participates in a match, so ''.join(match.groups()) never meets
        # a None:
        #   "   Record Name: Foo"        -> "Foo"
        #   "   Payer Record Layout"     -> "Payer Record Layout"
        heading_exp = re.compile(
            r'\s+(?:Record Name: |(?=.*Record Layout))(.*)'
        )

    # "p?" accepts the correct spelling "Description" as well as the
    # "Descrition" spelling the pattern was originally written with.
    field_heading_exp = re.compile(r'^Field.*Field.*Length.*Descrip?tion')

    opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
    pdftext = subprocess.check_output(opts)

    self.textrows = pdftext.split('\n')
    self.heading_exp = heading_exp
    self.field_heading_exp = field_heading_exp
|
||||
|
||||
def records(self):
|
||||
headings = self.locate_heading_rows()
|
||||
#headings = self.locate_heading_rows()
|
||||
headings = self.locate_heading_rows_by_field()
|
||||
|
||||
for (start, end, name) in headings:
|
||||
name = name.decode('ascii', 'ignore')
|
||||
yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))))
|
||||
|
||||
|
||||
def locate_heading_rows_by_field(self):
|
||||
results = []
|
||||
for (i, row) in enumerate(self.textrows):
|
||||
match = self.field_heading_exp.match(row)
|
||||
if match:
|
||||
# work backwards until we think the header is fully copied
|
||||
space_count_exp = re.compile('^(\s*)')
|
||||
position = i - 1
|
||||
last_spaces = space_count_exp.search(self.textrows[position]
|
||||
complete = False
|
||||
while not complete:
|
||||
position -= 1
|
||||
spaces = space_count_exp.search(self.textrows[position])
|
||||
if spaces > last_spaces:
|
||||
print 'HEADER', self.textrows[position:i]
|
||||
complete = True
|
||||
|
||||
|
||||
def locate_heading_rows(self):
|
||||
results = []
|
||||
for (i, row) in enumerate(self.textrows):
|
||||
match = self.heading_exp.match(row)
|
||||
if match:
|
||||
print i,match.groups()
|
||||
#if not ''.join(match.groups()).lower().endswith('(continued)'):
|
||||
#print i,match.groups()
|
||||
results.append((i, ''.join(match.groups())))
|
||||
|
||||
|
||||
"""
|
||||
results2 = []
|
||||
for r in results:
|
||||
|
@ -49,25 +70,36 @@ class PDFRecordFinder(object):
|
|||
|
||||
return merged
|
||||
|
||||
def locate_layout_block_rows(self):
|
||||
"""
|
||||
Search for rows that contain "Record Layout", as these are not fields
|
||||
we are interested in because they contain the crazy blocks of field definitions
|
||||
and not the nice 4-column ones that we're looking for."""
|
||||
|
||||
results = []
|
||||
for (i, row) in enumerate(self.textrows):
|
||||
match = re.match("Record Layout", row)
|
||||
|
||||
|
||||
|
||||
def find_fields(self, row_iter):
    """Group text rows into fields, yielding one ColumnCollector each.

    Each row from ``row_iter`` is decoded as UTF-8 and split into columns
    by ``extract_columns_from_row``; rows that yield no columns are
    skipped. Rows are accumulated in the current ColumnCollector until
    ``is_next_field`` reports that a row starts a new field, at which
    point the finished collector is yielded and a fresh one is started.

    Iteration ends early, WITHOUT yielding the collector in progress, as
    soon as a row raises UnknownColumn. Otherwise the last collector is
    yielded once the rows are exhausted.
    """
    cc = ColumnCollector()

    for r in row_iter:
        row = self.extract_columns_from_row(r.decode('UTF-8'))

        if not row:
            continue

        if cc.is_next_field(row):
            yield cc
            cc = ColumnCollector()

        try:
            cc.add(row)
        except UnknownColumn:
            # A plain return ends the generator. Raising StopIteration
            # inside a generator is turned into RuntimeError by PEP 479.
            return

    yield cc
|
||||
|
@ -102,13 +134,14 @@ class PDFRecordFinder(object):
|
|||
row_result.append(
|
||||
(start, row[start:end].encode('ascii','ignore'))
|
||||
)
|
||||
|
||||
|
||||
except StopIteration:
|
||||
white_iter = None
|
||||
|
||||
#print row_result
|
||||
return row_result
|
||||
|
||||
|
||||
|
||||
|
||||
class UnknownColumn(Exception):
    """Signals that a value could not be assigned to a known column."""
|
||||
|
||||
|
@ -128,12 +161,12 @@ class ColumnCollector(object):
|
|||
raise IsNextField()
|
||||
for col_id, value in data:
|
||||
self.merge_column(col_id, value)
|
||||
|
||||
|
||||
def merge_column(self, col_id, value):
|
||||
if col_id in self.data.keys():
|
||||
self.data[col_id] += ' ' + value.strip()
|
||||
|
||||
else:
|
||||
|
||||
else:
|
||||
# try adding a wiggle room value?
|
||||
# FIXME:
|
||||
# Sometimes description columns contain column-like
|
||||
|
@ -143,7 +176,7 @@ class ColumnCollector(object):
|
|||
# max column?
|
||||
|
||||
raise UnknownColumn
|
||||
|
||||
|
||||
def is_next_field(self, data):
|
||||
"""
|
||||
If the first key value contains a string
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue