Improved record detection; state records are now found
This commit is contained in:
parent
8cf78b5336
commit
e8e57bb932
1 changed file with 16 additions and 15 deletions
|
@@ -11,7 +11,7 @@ import pdb
|
|||
class PDFRecordFinder(object):
|
||||
def __init__(self, src, heading_exp=None):
|
||||
if not heading_exp:
|
||||
heading_exp = re.compile('\s+Record Name: (.*)')
|
||||
heading_exp = re.compile('(\s+Record Name: (.*))|Record\ Layout')
|
||||
|
||||
field_heading_exp = re.compile('^Field.*Field.*Length.*Description')
|
||||
|
||||
|
@@ -35,28 +35,29 @@ class PDFRecordFinder(object):
|
|||
def locate_heading_rows_by_field(self):
|
||||
results = []
|
||||
record_break = []
|
||||
line_is_whitespace_exp = re.compile('^(\s*)$')
|
||||
record_begin_exp = self.heading_exp #re.compile('Record\ Name')
|
||||
|
||||
for (i, row) in enumerate(self.textrows):
|
||||
match = self.field_heading_exp.match(row)
|
||||
if match:
|
||||
# work backwards until we think the header is fully copied
|
||||
space_count_exp = re.compile('^(\s*)')
|
||||
position = i - 1
|
||||
last_spaces = 10000
|
||||
spaces = 0
|
||||
#last_spaces = 10000
|
||||
complete = False
|
||||
header = None
|
||||
while not complete:
|
||||
if len(self.textrows[position].strip()) == 0:
|
||||
spaces = 10000
|
||||
else:
|
||||
spaces = space_count_exp.search(self.textrows[position]).end()
|
||||
|
||||
if spaces > last_spaces:
|
||||
header = self.textrows[position + 1:i]
|
||||
line_is_whitespace = True if line_is_whitespace_exp.match(self.textrows[position]) else False
|
||||
is_record_begin = record_begin_exp.search(self.textrows[position])
|
||||
if is_record_begin or line_is_whitespace:
|
||||
header = self.textrows[position-1:i]
|
||||
complete = True
|
||||
last_spaces = spaces
|
||||
position -= 1
|
||||
|
||||
name = ''.join(header).strip().decode('ascii','ignore')
|
||||
print (name, position)
|
||||
results.append((i, name, position))
|
||||
else:
|
||||
# See if this row forces us to break from field reading.
|
||||
|
@@ -81,6 +82,7 @@ class PDFRecordFinder(object):
|
|||
merged.append( (a[0], end_pos-1, a[1]) )
|
||||
return merged
|
||||
|
||||
"""
|
||||
def locate_heading_rows(self):
|
||||
results = []
|
||||
for (i, row) in enumerate(self.textrows):
|
||||
|
@@ -95,16 +97,15 @@ class PDFRecordFinder(object):
|
|||
return merged
|
||||
|
||||
def locate_layout_block_rows(self):
|
||||
"""
|
||||
Search for rows that contain "Record Layout", as these are not fields
|
||||
we are interested in because they contain the crazy blocks of field definitions
|
||||
and not the nice 4-column ones that we're looking for."""
|
||||
# Search for rows that contain "Record Layout", as these are not fields
|
||||
# we are interested in because they contain the crazy blocks of field definitions
|
||||
# and not the nice 4-column ones that we're looking for.
|
||||
|
||||
results = []
|
||||
for (i, row) in enumerate(self.textrows):
|
||||
match = re.match("Record Layout", row)
|
||||
|
||||
|
||||
"""
|
||||
|
||||
def find_fields(self, row_iter):
|
||||
cc = ColumnCollector()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue