Improved record detection; state records are now found.

This commit is contained in:
Binh 2013-03-26 13:23:48 -05:00
parent 8cf78b5336
commit e8e57bb932

View file

@ -11,7 +11,7 @@ import pdb
class PDFRecordFinder(object): class PDFRecordFinder(object):
def __init__(self, src, heading_exp=None): def __init__(self, src, heading_exp=None):
if not heading_exp: if not heading_exp:
heading_exp = re.compile('\s+Record Name: (.*)') heading_exp = re.compile('(\s+Record Name: (.*))|Record\ Layout')
field_heading_exp = re.compile('^Field.*Field.*Length.*Description') field_heading_exp = re.compile('^Field.*Field.*Length.*Description')
@ -35,28 +35,29 @@ class PDFRecordFinder(object):
def locate_heading_rows_by_field(self): def locate_heading_rows_by_field(self):
results = [] results = []
record_break = [] record_break = []
line_is_whitespace_exp = re.compile('^(\s*)$')
record_begin_exp = self.heading_exp #re.compile('Record\ Name')
for (i, row) in enumerate(self.textrows): for (i, row) in enumerate(self.textrows):
match = self.field_heading_exp.match(row) match = self.field_heading_exp.match(row)
if match: if match:
# work backwards until we think the header is fully copied # work backwards until we think the header is fully copied
space_count_exp = re.compile('^(\s*)') space_count_exp = re.compile('^(\s*)')
position = i - 1 position = i - 1
last_spaces = 10000 spaces = 0
#last_spaces = 10000
complete = False complete = False
header = None header = None
while not complete: while not complete:
if len(self.textrows[position].strip()) == 0: line_is_whitespace = True if line_is_whitespace_exp.match(self.textrows[position]) else False
spaces = 10000 is_record_begin = record_begin_exp.search(self.textrows[position])
else: if is_record_begin or line_is_whitespace:
spaces = space_count_exp.search(self.textrows[position]).end() header = self.textrows[position-1:i]
if spaces > last_spaces:
header = self.textrows[position + 1:i]
complete = True complete = True
last_spaces = spaces
position -= 1 position -= 1
name = ''.join(header).strip().decode('ascii','ignore') name = ''.join(header).strip().decode('ascii','ignore')
print (name, position)
results.append((i, name, position)) results.append((i, name, position))
else: else:
# See if this row forces us to break from field reading. # See if this row forces us to break from field reading.
@ -81,6 +82,7 @@ class PDFRecordFinder(object):
merged.append( (a[0], end_pos-1, a[1]) ) merged.append( (a[0], end_pos-1, a[1]) )
return merged return merged
"""
def locate_heading_rows(self): def locate_heading_rows(self):
results = [] results = []
for (i, row) in enumerate(self.textrows): for (i, row) in enumerate(self.textrows):
@ -95,16 +97,15 @@ class PDFRecordFinder(object):
return merged return merged
def locate_layout_block_rows(self): def locate_layout_block_rows(self):
""" # Search for rows that contain "Record Layout", as these are not fields
Search for rows that contain "Record Layout", as these are not fields # we are interested in because they contain the crazy blocks of field definitions
we are interested in because they contain the crazy blocks of field definitions # and not the nice 4-column ones that we're looking for.
and not the nice 4-column ones that we're looking for."""
results = [] results = []
for (i, row) in enumerate(self.textrows): for (i, row) in enumerate(self.textrows):
match = re.match("Record Layout", row) match = re.match("Record Layout", row)
"""
def find_fields(self, row_iter): def find_fields(self, row_iter):
cc = ColumnCollector() cc = ColumnCollector()