#!/usr/bin/python
# coding=UTF-8

import subprocess
import re
import pdb

""" pdftotext -layout -nopgbrk p1220.pdf - """


class PDFRecordFinder(object):
    """Finds record/field tables in the text layout produced by pdftotext."""

    def __init__(self, src, heading_exp=None):
        if not heading_exp:
            heading_exp = re.compile(r'(\s+Record Name: (.*))|Record\ Layout')

        field_heading_exp = re.compile(r'^Field.*Field.*Length.*Description')

        # Convert the source PDF to layout-preserving text, one line per row.
        opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
        pdftext = subprocess.check_output(opts)
        self.textrows = pdftext.split('\n')
        self.heading_exp = heading_exp
        self.field_heading_exp = field_heading_exp
    def records(self):
        headings = self.locate_heading_rows_by_field()

        #for x in headings:
        #    print x

        for (start, end, name) in headings:
            name = name.decode('ascii', 'ignore')
            yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))), (start+1, end))
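    # Each item yielded by records() is, roughly:
    #   (record_name, [ColumnCollector, ...], (start_row, end_row))
    # where the row numbers index into self.textrows and the record name is
    # taken from the heading text found above the
    # "Field ... Length ... Description" header row.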
    def locate_heading_rows_by_field(self):
        results = []
        record_break = []
        line_is_whitespace_exp = re.compile(r'^(\s*)$')
        record_begin_exp = self.heading_exp  #re.compile('Record\ Name')

        for (i, row) in enumerate(self.textrows):
            match = self.field_heading_exp.match(row)
            if match:
                # work backwards until we think the header is fully copied
                space_count_exp = re.compile(r'^(\s*)')
                position = i - 1
                spaces = 0
                #last_spaces = 10000
                complete = False
                header = None
                while not complete:
                    line_is_whitespace = True if line_is_whitespace_exp.match(self.textrows[position]) else False
                    is_record_begin = record_begin_exp.search(self.textrows[position])

                    if is_record_begin or line_is_whitespace:
                        header = self.textrows[position-1:i]
                        complete = True
                    position -= 1

                name = ''.join(header).strip().decode('ascii', 'ignore')
                print (name, position)
                results.append((i, name, position))
            else:
                # See if this row forces us to break from field reading.
                if re.search(r'Record\ Layout', row):
                    record_break.append(i)

        merged = []
        for (a, b) in zip(results, results[1:] + [(len(self.textrows), None)]):
            end_pos = None

            #print a[0], record_break[0], b[0]-1

            # Drop any layout breaks that occur before this heading.
            while record_break and record_break[0] < a[0]:
                record_break = record_break[1:]

            # End this record at the next "Record Layout" break if one comes
            # before the next heading; otherwise end at the next heading.
            if record_break and record_break[0] < b[0] - 1:
                end_pos = record_break[0]
                record_break = record_break[1:]
            else:
                end_pos = b[0] - 1

            merged.append((a[0], end_pos - 1, a[1]))
        return merged
"""
|
|
def locate_heading_rows(self):
|
|
results = []
|
|
for (i, row) in enumerate(self.textrows):
|
|
match = self.heading_exp.match(row)
|
|
if match:
|
|
results.append((i, ''.join(match.groups())))
|
|
|
|
merged = []
|
|
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
|
|
merged.append( (a[0], b[0]-1, a[1]) )
|
|
|
|
return merged
|
|
|
|
def locate_layout_block_rows(self):
|
|
# Search for rows that contain "Record Layout", as these are not fields
|
|
# we are interested in because they contain the crazy blocks of field definitions
|
|
# and not the nice 4-column ones that we're looking for.
|
|
|
|
results = []
|
|
for (i, row) in enumerate(self.textrows):
|
|
match = re.match("Record Layout", row)
|
|
|
|
"""
|
|
|
|
    def find_fields(self, row_iter):
        cc = ColumnCollector()
        blank_row_counter = 0

        for r in row_iter:
            row = r.decode('UTF-8')
            #print row
            row_columns = self.extract_columns_from_row(row)

            if not row_columns:
                # A non-table row either ends the current field or is just a
                # gap inside it; treat it as the end when the collector already
                # spans several columns and the text runs past the right-most
                # column start (max() because dict key order is arbitrary).
                if cc.data and len(cc.data.keys()) > 1 and len(row.strip()) > max(cc.data.keys()):
                    yield cc
                    cc = ColumnCollector()
                else:
                    cc.empty_row()
                continue

            try:
                cc.add(row_columns)
            except IsNextField:
                yield cc
                cc = ColumnCollector()
                cc.add(row_columns)
            except UnknownColumn:
                raise StopIteration

        yield cc
    def extract_columns_from_row(self, row):
        re_multiwhite = re.compile(r'\s{2,}')

        # If the line doesn't contain runs of multiple whitespace characters,
        # it's likely not a table row.
        if not re_multiwhite.search(row):
            return None

        # Collect the offsets where each run of whitespace starts and ends.
        white_ranges = [0, ]
        pos = 0
        while pos < len(row):
            match = re_multiwhite.search(row[pos:])
            if match:
                white_ranges.append(pos + match.start())
                white_ranges.append(pos + match.end())
                pos += match.end()
            else:
                white_ranges.append(len(row))
                pos = len(row)

        # Pair the offsets up into (start offset, cell text) tuples.
        row_result = []
        white_iter = iter(white_ranges)
        while white_iter:
            try:
                start = white_iter.next()
                end = white_iter.next()
                if start != end:
                    row_result.append(
                        (start, row[start:end].encode('ascii', 'ignore'))
                    )
            except StopIteration:
                white_iter = None

        #print row_result
        return row_result
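    # Illustrative example, assuming "finder" is a PDFRecordFinder instance
    # (the row text below is made up rather than taken from a real layout):
    #
    #   >>> finder.extract_columns_from_row(u'Payment Year      4     1-4    Required')
    #   [(0, 'Payment Year'), (18, '4'), (24, '1-4'), (31, 'Required')]
    #
    # i.e. one (start offset, cell text) pair per run of text separated by two
    # or more spaces.
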

class UnknownColumn(Exception):
    pass


class IsNextField(Exception):
    pass

class ColumnCollector(object):
    def __init__(self, initial=None):
        self.data = None
        self.column_widths = None
        self.max_data_length = 0
        self.adjust_pad = 3
        self.empty_rows = 0

    def __repr__(self):
        return "<%s: %s>" % (
            self.__class__.__name__,
            map(lambda x: x if len(x) < 25 else x[:25] + '..',
                self.data.values() if self.data else ''))
    def add(self, data):
        #if self.empty_rows > 2:
        #    raise IsNextField()

        if not self.data:
            self.data = dict(data)
        else:
            data = self.adjust_columns(data)
            if self.is_next_field(data):
                raise IsNextField()
            for col_id, value in data:
                self.merge_column(col_id, value)

        self.update_column_widths(data)

    def empty_row(self):
        self.empty_rows += 1
    def update_column_widths(self, data):
        self.last_data_length = len(data)
        self.max_data_length = max(self.max_data_length, len(data))

        if not self.column_widths:
            # Map each column start offset to its right-most known extent.
            self.column_widths = dict(map(lambda (column, value): [column, column + len(value)], data))
        else:
            for col_id, value in data:
                try:
                    self.column_widths[col_id] = max(self.column_widths[col_id], col_id + len(value.strip()))
                except KeyError:
                    pass
    def add_old(self, data):
        if not self.data:
            self.data = dict(data)
        else:
            if self.is_next_field(data):
                raise IsNextField()
            for col_id, value in data:
                self.merge_column(col_id, value)
    def adjust_columns(self, data):
        adjusted_data = {}
        for col_id, value in data:
            if col_id in self.data.keys():
                adjusted_data[col_id] = value.strip()
            else:
                # Fold the value into a known column whose span, padded by
                # adjust_pad characters on either side, contains this offset.
                for col_start, col_end in self.column_widths.items():
                    if (col_start - self.adjust_pad) <= col_id and (col_end + self.adjust_pad) >= col_id:
                        if col_start in adjusted_data:
                            adjusted_data[col_start] += ' ' + value.strip()
                        else:
                            adjusted_data[col_start] = value.strip()

        return adjusted_data.items()
    def merge_column(self, col_id, value):
        if col_id in self.data.keys():
            self.data[col_id] += ' ' + value.strip()
        else:
            # try adding a wiggle room value?
            # FIXME:
            # Sometimes description columns contain column-like
            # layouts, and this causes the ColumnCollector to become
            # confused. Perhaps we could check to see if a column occurs
            # after the maximum column, and assume it's part of the
            # max column?

            """
            for col_start, col_end in self.column_widths.items():
                if col_start <= col_id and (col_end) >= col_id:
                    self.data[col_start] += ' ' + value.strip()
                    return
            """
            raise UnknownColumn
    def is_next_field(self, data):
        """
        If the first key value contains a string and we already have some
        data in the record, then this row is probably the beginning of the
        next field.  Raise an exception and continue with a fresh
        ColumnCollector.

        If the length of the value in column_id is less than the position of
        the next column_id, then this is probably a continuation.
        """

        if self.data and data:
            keys = dict(self.column_widths).keys()
            keys.sort()
            keys += [None]

            if self.last_data_length < len(data):
                return True

            # Compare the left-most column of the incoming row against the
            # left-most column already collected (min()/sorted() because dict
            # ordering is arbitrary).
            first_key, first_value = sorted(data)[0]
            if min(self.data.keys()) == first_key:

                position = keys.index(first_key)
                max_length = keys[position + 1]
                if max_length:
                    return len(first_value) > max_length or len(data) == self.max_data_length

        return False
    @property
    def tuple(self):
        #try:
        if self.data:
            return tuple(map(lambda k: self.data[k], sorted(self.data.keys())))
        return ()
        #except:
        #    import pdb
        #    pdb.set_trace()
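

# Minimal usage sketch, assuming the path to a layout-style PDF (for example
# the IRS p1220 publication mentioned at the top of this file) is passed as
# the first command-line argument:
if __name__ == '__main__':
    import sys

    finder = PDFRecordFinder(sys.argv[1])
    for (name, fields, (start, end)) in finder.records():
        print name
        for field in fields:
            print '\t', field.tuple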