Changed the way records are found: we now search for the field-heading rows and then work
backwards from each one to determine the record name. We also added the ability to "break" out of reading a series of field definitions at certain break points, such as a "Record Layout" line. There is currently an error at p1220 line 2704, caused by the column data starting in the 4th column ("Description and Remarks"). If each ColumnCollector started from the field titles and was aware of the column positions they establish, it might be possible to at least read the following record's fields without auto-adjusting the column positions.
This commit is contained in:
parent
8995f142e5
commit
6e4a975cfb
1 changed files with 42 additions and 11 deletions
|
@ -11,9 +11,9 @@ import pdb
|
||||||
class PDFRecordFinder(object):
|
class PDFRecordFinder(object):
|
||||||
def __init__(self, src, heading_exp=None):
|
def __init__(self, src, heading_exp=None):
|
||||||
if not heading_exp:
|
if not heading_exp:
|
||||||
heading_exp = re.compile('(\s+Record Name: (.*)|\s+(.*Record Layout.*)')
|
heading_exp = re.compile('\s+Record Name: (.*)')
|
||||||
|
|
||||||
field_heading_exp = re.compile('^Field.*Field.*Length.*Descrition')
|
field_heading_exp = re.compile('^Field.*Field.*Length.*Description')
|
||||||
|
|
||||||
opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
|
opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
|
||||||
pdftext = subprocess.check_output(opts)
|
pdftext = subprocess.check_output(opts)
|
||||||
|
@ -27,26 +27,57 @@ class PDFRecordFinder(object):
|
||||||
|
|
||||||
for (start, end, name) in headings:
|
for (start, end, name) in headings:
|
||||||
name = name.decode('ascii', 'ignore')
|
name = name.decode('ascii', 'ignore')
|
||||||
yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))))
|
yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))), (start+1, end))
|
||||||
|
|
||||||
|
|
||||||
def locate_heading_rows_by_field(self):
|
def locate_heading_rows_by_field(self):
|
||||||
results = []
|
results = []
|
||||||
|
record_break = []
|
||||||
for (i, row) in enumerate(self.textrows):
|
for (i, row) in enumerate(self.textrows):
|
||||||
match = self.field_heading_exp.match(row)
|
match = self.field_heading_exp.match(row)
|
||||||
if match:
|
if match:
|
||||||
# work backwards until we think the header is fully copied
|
# work backwards until we think the header is fully copied
|
||||||
space_count_exp = re.compile('^(\s*)')
|
space_count_exp = re.compile('^(\s*)')
|
||||||
position = i - 1
|
position = i - 1
|
||||||
last_spaces = space_count_exp.search(self.textrows[position]
|
last_spaces = 10000
|
||||||
complete = False
|
complete = False
|
||||||
|
header = None
|
||||||
while not complete:
|
while not complete:
|
||||||
position -= 1
|
if len(self.textrows[position].strip()) == 0:
|
||||||
spaces = space_count_exp.search(self.textrows[position])
|
spaces = 10000
|
||||||
if spaces > last_spaces:
|
else:
|
||||||
print 'HEADER', self.textrows[position:i]
|
spaces = space_count_exp.search(self.textrows[position]).end()
|
||||||
complete = True
|
|
||||||
|
|
||||||
|
if spaces > last_spaces:
|
||||||
|
header = self.textrows[position + 1:i]
|
||||||
|
complete = True
|
||||||
|
last_spaces = spaces
|
||||||
|
position -= 1
|
||||||
|
|
||||||
|
name = ''.join(header).strip().decode('ascii','ignore')
|
||||||
|
results.append((i, name))
|
||||||
|
else:
|
||||||
|
# See if this row forces us to break from field reading.
|
||||||
|
if re.search('Record\ Layout', row):
|
||||||
|
record_break.append(i)
|
||||||
|
|
||||||
|
merged = []
|
||||||
|
for (a, b) in zip(results, results[1:] + [(len(self.textrows), None)]):
|
||||||
|
end_pos = None
|
||||||
|
|
||||||
|
print a[0], record_break[0], b[0]-1
|
||||||
|
|
||||||
|
while record_break and record_break[0] < a[0]:
|
||||||
|
record_break = record_break[1:]
|
||||||
|
|
||||||
|
if record_break[0] < b[0]-1:
|
||||||
|
end_pos = record_break[0]
|
||||||
|
record_break = record_break[1:]
|
||||||
|
else:
|
||||||
|
end_pos = b[0]-1
|
||||||
|
|
||||||
|
merged.append( (a[0], end_pos, a[1]) )
|
||||||
|
return merged
|
||||||
|
|
||||||
def locate_heading_rows(self):
|
def locate_heading_rows(self):
|
||||||
results = []
|
results = []
|
||||||
|
@ -175,8 +206,8 @@ class ColumnCollector(object):
|
||||||
return "<%s: %s>" % (self.__class__.__name__, map(lambda x:x if len(x) < 25 else x[:25] + '..', self.data.values()))
|
return "<%s: %s>" % (self.__class__.__name__, map(lambda x:x if len(x) < 25 else x[:25] + '..', self.data.values()))
|
||||||
|
|
||||||
def add(self, data):
|
def add(self, data):
|
||||||
if self.empty_rows > 2:
|
#if self.empty_rows > 2:
|
||||||
raise IsNextField()
|
# raise IsNextField()
|
||||||
|
|
||||||
if not self.data:
|
if not self.data:
|
||||||
self.data = dict(data)
|
self.data = dict(data)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue