trying new header location method
This commit is contained in:
parent
46755dd90d
commit
6e1d02db8d
1 changed files with 53 additions and 20 deletions
|
@ -11,28 +11,49 @@ import pdb
|
|||
class PDFRecordFinder(object):
|
||||
def __init__(self, src, heading_exp=None):
    """Extract the text of a PDF with ``pdftotext`` and store the row patterns.

    Args:
        src: Path of the PDF, passed to ``pdftotext`` as its input file.
        heading_exp: Optional compiled pattern used to recognise record
            heading rows. When omitted (or falsy) a default is built that
            accepts either a ``Record Name: <name>`` row or any row that
            mentions ``Record Layout``.

    Raises:
        subprocess.CalledProcessError: ``pdftotext`` exited with a
            non-zero status.
        OSError: the ``pdftotext`` executable could not be started.
    """
    if not heading_exp:
        # The previous alternation had an unbalanced parenthesis and could
        # not be compiled. This form keeps exactly one capture group that
        # always participates, so ''.join(match.groups()) yields the
        # heading text for both kinds of row.
        heading_exp = re.compile(
            r'\s+(?:Record Name: |(?=.*Record Layout))(.*)')

    # "Descrip?tion" accepts the correctly spelled column title as well as
    # the misspelling ("Descrition") that the pattern originally required.
    field_heading_exp = re.compile(r'^Field.*Field.*Length.*Descrip?tion')

    # -layout keeps the column alignment of the page, -nopgbrk suppresses
    # form feeds between pages, and '-' sends the text to stdout.
    opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
    pdftext = subprocess.check_output(opts)

    self.textrows = pdftext.split('\n')
    self.heading_exp = heading_exp
    self.field_heading_exp = field_heading_exp
|
||||
|
||||
def records(self):
    """Yield ``(name, fields)`` for each record heading found in the text.

    Headings are taken from ``locate_heading_rows_by_field()`` as
    ``(start, end, name)`` tuples. The rows strictly after ``start`` and
    before ``end`` are passed to ``find_fields()`` and its results are
    collected into a list.

    Yields:
        tuple: ``(name, fields)`` where ``name`` is the heading text and
        ``fields`` is the list produced from ``find_fields()``.
    """
    # Only the field-based locator is used. The older locate_heading_rows()
    # call was superseded and its result was discarded, so it is not run.
    # A locator that returns None is treated as "no headings found".
    headings = self.locate_heading_rows_by_field() or []

    for start, end, name in headings:
        # Byte-string names are reduced to ASCII text; names that are
        # already text are passed through unchanged.
        if isinstance(name, bytes):
            name = name.decode('ascii', 'ignore')
        body_rows = self.textrows[start + 1:end]
        yield (name, list(self.find_fields(iter(body_rows))))
|
||||
|
||||
|
||||
def locate_heading_rows_by_field(self):
    """Locate record headings by walking upward from each field-table row.

    Every row matching ``self.field_heading_exp`` is treated as the column
    heading of a field table. From the row directly above it, the scan
    moves upward until it meets a row with more leading whitespace than
    that reference row; the rows from there down to the table heading are
    taken to be the record's header text.

    NOTE(review): the indentation heuristic is exploratory. Confirm against
    real ``pdftotext`` output that it isolates the intended header rows.

    Returns:
        list: ``(start, end, name)`` tuples in document order. ``start`` is
        the index of the field-table heading row, ``end`` is the index of
        the first header row of the next record (or ``len(self.textrows)``
        for the last one), and ``name`` is the header rows stripped and
        joined with single spaces. Tables for which no header row is found
        are omitted.
    """
    indent_exp = re.compile(r'^(\s*)')

    def indent_of(text):
        # Width of the leading whitespace. The pattern can match the empty
        # string, so match() never returns None here.
        return len(indent_exp.match(text).group(1))

    rows = self.textrows
    found = []  # (header_start, table_heading_row, name)

    for i, row in enumerate(rows):
        # A table heading on the very first row has nothing above it.
        if i == 0 or not self.field_heading_exp.match(row):
            continue

        reference = indent_of(rows[i - 1])
        # Never scan past the previous table heading (or the top of the
        # document); negative indexes would silently wrap to the end.
        floor = found[-1][1] if found else -1
        for position in range(i - 2, floor, -1):
            if indent_of(rows[position]) > reference:
                header_rows = rows[position:i]
                name = ' '.join(r.strip() for r in header_rows if r.strip())
                found.append((position, i, name))
                break

    results = []
    for n, (header_start, table_row, name) in enumerate(found):
        if n + 1 < len(found):
            end = found[n + 1][0]
        else:
            end = len(rows)
        results.append((table_row, end, name))
    return results
|
||||
|
||||
|
||||
def locate_heading_rows(self):
|
||||
results = []
|
||||
for (i, row) in enumerate(self.textrows):
|
||||
match = self.heading_exp.match(row)
|
||||
if match:
|
||||
print i,match.groups()
|
||||
#if not ''.join(match.groups()).lower().endswith('(continued)'):
|
||||
#print i,match.groups()
|
||||
results.append((i, ''.join(match.groups())))
|
||||
|
||||
"""
|
||||
|
@ -49,6 +70,17 @@ class PDFRecordFinder(object):
|
|||
|
||||
return merged
|
||||
|
||||
def locate_layout_block_rows(self):
    """Return the indexes of rows that contain the text "Record Layout".

    These rows are not the fields we are interested in: they introduce the
    dense blocks of field definitions rather than the four-column tables
    this class is looking for.

    Returns:
        list: Indexes into ``self.textrows``, in ascending order, of every
        row containing ``Record Layout`` anywhere in the row.
    """
    # A substring test is used rather than re.match(), which only matches
    # at the start of the row and so misses indented occurrences.
    return [i for (i, row) in enumerate(self.textrows)
            if "Record Layout" in row]
|
||||
|
||||
|
||||
|
||||
def find_fields(self, row_iter):
|
||||
cc = ColumnCollector()
|
||||
|
@ -59,9 +91,9 @@ class PDFRecordFinder(object):
|
|||
continue
|
||||
|
||||
if cc.is_next_field(row):
|
||||
if row[1][1] == 'Vendor Indicator':
|
||||
import pdb
|
||||
pdb.set_trace()
|
||||
#if row[1][1] == 'Vendor Indicator':
|
||||
# import pdb
|
||||
# pdb.set_trace()
|
||||
yield cc
|
||||
cc = ColumnCollector()
|
||||
|
||||
|
@ -106,6 +138,7 @@ class PDFRecordFinder(object):
|
|||
except StopIteration:
|
||||
white_iter = None
|
||||
|
||||
#print row_result
|
||||
return row_result
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue