trying new header location method
This commit is contained in:
parent
46755dd90d
commit
6e1d02db8d
1 changed file with 53 additions and 20 deletions
|
@ -11,28 +11,49 @@ import pdb
|
||||||
class PDFRecordFinder(object):
|
class PDFRecordFinder(object):
|
||||||
def __init__(self, src, heading_exp=None):
    """Extract the text of a PDF and prepare the heading patterns.

    Runs ``pdftotext`` on ``src`` (output sent to stdout via ``-``) and
    stores the output, split on newlines, in ``self.textrows``.

    Args:
        src: path of the PDF file handed to ``pdftotext``.
        heading_exp: optional compiled regular expression used to spot
            record heading rows. When falsy, a default is used that matches
            either a "Record Name: ..." row or a row containing
            "Record Layout".

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
        OSError: if the ``pdftotext`` executable cannot be started.
    """
    if not heading_exp:
        # The outer group is closed (it was left unterminated, which made
        # re.compile raise) and is non-capturing so that only the heading
        # text itself ends up in match.groups().
        heading_exp = re.compile(
            r'(?:\s+Record Name: (.*)|\s+(.*Record Layout.*))')

    # Column header line that introduces a table of field definitions.
    # "Descrip?tion" accepts the correct spelling as well as the misspelt
    # "Descrition" the pattern was originally written with.
    field_heading_exp = re.compile(r'^Field.*Field.*Length.*Descrip?tion')

    opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
    pdftext = subprocess.check_output(opts)

    # NOTE(review): splitting on a str newline assumes check_output returns
    # str (Python 2); under Python 3 it returns bytes - confirm target.
    self.textrows = pdftext.split('\n')
    self.heading_exp = heading_exp
    self.field_heading_exp = field_heading_exp
|
||||||
|
|
||||||
def records(self):
    """Yield ``(name, fields)`` for each record heading that is located.

    Headings come from ``self.locate_heading_rows_by_field()`` as
    ``(start, end, name)`` tuples. The rows strictly after ``start`` and
    before ``end`` are passed to ``self.find_fields`` and the result is
    materialised into a list.

    Yields:
        tuple: the heading name decoded as ASCII (undecodable bytes are
        dropped) and the list of items produced by ``find_fields``.
    """
    # The locator may return None when it has nothing to report; treat that
    # the same as "no records" instead of failing on iteration.
    headings = self.locate_heading_rows_by_field() or []

    for (start, end, name) in headings:
        name = name.decode('ascii', 'ignore')
        body_rows = iter(self.textrows[start + 1:end])
        yield (name, list(self.find_fields(body_rows)))
|
||||||
|
|
||||||
|
|
||||||
|
def locate_heading_rows_by_field(self):
    """Locate records by finding field-table heading rows.

    Every row matching ``self.field_heading_exp`` marks the start of a
    field table. The record header is found by working backwards from the
    row just above it: the header starts at the nearest earlier row whose
    leading whitespace is wider than that of the row directly above the
    field heading. The backward search never crosses the previous field
    heading row or the top of the text; if no wider-indented row is found,
    only the row directly above is taken as the header.

    Returns:
        list: ``(start, end, name)`` tuples, where ``start`` is the index
        of the field heading row, ``end`` is the index where the next
        record's header begins (or ``len(self.textrows)`` for the last
        record) and ``name`` is the non-blank header lines, stripped and
        joined with single spaces.
        NOTE(review): this tuple layout is inferred from how ``records()``
        consumes the result - confirm it is the intended contract.
    """
    # Compiled once: loop-invariant.
    leading_space_exp = re.compile(r'^(\s*)')

    def indent_width(row):
        # Compare widths, not match objects: match objects have no
        # meaningful ordering.
        return len(leading_space_exp.match(row).group(1))

    # (header_start, field_heading_row, name) for each field table found.
    headers = []
    for (i, row) in enumerate(self.textrows):
        if not self.field_heading_exp.match(row):
            continue

        # Lowest row index the header is allowed to reach back to.
        floor = headers[-1][1] + 1 if headers else 0

        header_start = i
        if i - 1 >= floor:
            header_start = i - 1
            last_spaces = indent_width(self.textrows[i - 1])
            # Work backwards until we think the header is fully covered.
            for position in range(i - 2, floor - 1, -1):
                if indent_width(self.textrows[position]) > last_spaces:
                    header_start = position
                    break

        header_lines = [line.strip() for line in self.textrows[header_start:i]]
        name = ' '.join(line for line in header_lines if line)
        headers.append((header_start, i, name))

    results = []
    for (n, (header_start, start, name)) in enumerate(headers):
        if n + 1 < len(headers):
            end = headers[n + 1][0]
        else:
            end = len(self.textrows)
        results.append((start, end, name))
    return results
|
||||||
|
|
||||||
|
|
||||||
def locate_heading_rows(self):
|
def locate_heading_rows(self):
|
||||||
results = []
|
results = []
|
||||||
for (i, row) in enumerate(self.textrows):
|
for (i, row) in enumerate(self.textrows):
|
||||||
match = self.heading_exp.match(row)
|
match = self.heading_exp.match(row)
|
||||||
if match:
|
if match:
|
||||||
print i,match.groups()
|
#print i,match.groups()
|
||||||
#if not ''.join(match.groups()).lower().endswith('(continued)'):
|
|
||||||
results.append((i, ''.join(match.groups())))
|
results.append((i, ''.join(match.groups())))
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
@ -49,6 +70,17 @@ class PDFRecordFinder(object):
|
||||||
|
|
||||||
return merged
|
return merged
|
||||||
|
|
||||||
|
def locate_layout_block_rows(self):
    """Return the indexes of rows that contain "Record Layout".

    These rows are not fields we are interested in, because they contain
    the crazy blocks of field definitions and not the nice 4-column ones
    that we're looking for.

    Returns:
        list: indexes into ``self.textrows`` of every row containing the
        text "Record Layout" anywhere in the line.
    """
    # A substring test is used rather than re.match, which only matches at
    # the very start of the row and would miss indented occurrences.
    return [i for (i, row) in enumerate(self.textrows)
            if "Record Layout" in row]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def find_fields(self, row_iter):
|
def find_fields(self, row_iter):
|
||||||
cc = ColumnCollector()
|
cc = ColumnCollector()
|
||||||
|
@ -59,9 +91,9 @@ class PDFRecordFinder(object):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if cc.is_next_field(row):
|
if cc.is_next_field(row):
|
||||||
if row[1][1] == 'Vendor Indicator':
|
#if row[1][1] == 'Vendor Indicator':
|
||||||
import pdb
|
# import pdb
|
||||||
pdb.set_trace()
|
# pdb.set_trace()
|
||||||
yield cc
|
yield cc
|
||||||
cc = ColumnCollector()
|
cc = ColumnCollector()
|
||||||
|
|
||||||
|
@ -106,6 +138,7 @@ class PDFRecordFinder(object):
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
white_iter = None
|
white_iter = None
|
||||||
|
|
||||||
|
#print row_result
|
||||||
return row_result
|
return row_result
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue