Merge branch 'master' of brimstone.klowner.com:pyaccuwage
Conflicts: pyaccuwage/pdfextract.py
This commit is contained in:
commit
8995f142e5
3 changed files with 205 additions and 65 deletions
|
@ -39,7 +39,7 @@ class SimpleDefParser(object):
|
||||||
else:
|
else:
|
||||||
item = self._intify(item)
|
item = self._intify(item)
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
|
|
||||||
class LengthExpression(object):
|
class LengthExpression(object):
|
||||||
import operator
|
import operator
|
||||||
|
@ -115,14 +115,16 @@ class NumericToken(BaseToken):
|
||||||
@property
|
@property
|
||||||
def value(self):
|
def value(self):
|
||||||
return int(self._value)
|
return int(self._value)
|
||||||
|
|
||||||
|
|
||||||
class RecordBuilder(object):
|
class RecordBuilder(object):
|
||||||
import fields
|
import fields
|
||||||
|
|
||||||
|
entry_max_length = 4
|
||||||
|
|
||||||
TOKEN_TYPES = [
|
TOKEN_TYPES = [
|
||||||
RangeToken,
|
RangeToken,
|
||||||
NumericToken,
|
NumericToken,
|
||||||
StringToken,
|
StringToken,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -134,7 +136,7 @@ class RecordBuilder(object):
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
|
|
||||||
(fields.MoneyField, {
|
(fields.MoneyField, {
|
||||||
'regexp': {
|
'regexp': {
|
||||||
'desc': [
|
'desc': [
|
||||||
|
@ -144,7 +146,7 @@ class RecordBuilder(object):
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
|
|
||||||
(fields.TextField, {
|
(fields.TextField, {
|
||||||
'regexp': {
|
'regexp': {
|
||||||
'desc': [
|
'desc': [
|
||||||
|
@ -171,13 +173,16 @@ class RecordBuilder(object):
|
||||||
entries = self._guess_field_types(entries)
|
entries = self._guess_field_types(entries)
|
||||||
entries = self._convert_to_records(entries)
|
entries = self._convert_to_records(entries)
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
def _compile(self, entries):
|
def _compile(self, entries):
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
|
|
||||||
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry))
|
if len(entry) > self.entry_max_length:
|
||||||
|
continue
|
||||||
|
|
||||||
|
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(self.entry_max_length-len(entry))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
f_length = int(f_length)
|
f_length = int(f_length)
|
||||||
except ValueError, e:
|
except ValueError, e:
|
||||||
|
@ -186,9 +191,11 @@ class RecordBuilder(object):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
assert f_length == RangeToken(f_range).value
|
assert f_length == RangeToken(f_range).value
|
||||||
except AssertionError:
|
except AssertionError, e:
|
||||||
import pdb
|
continue
|
||||||
pdb.set_trace()
|
except ValueError, e:
|
||||||
|
# bad result, skip
|
||||||
|
continue
|
||||||
|
|
||||||
name_parts = f_name.split(' ')
|
name_parts = f_name.split(' ')
|
||||||
|
|
||||||
|
@ -199,43 +206,45 @@ class RecordBuilder(object):
|
||||||
required = True
|
required = True
|
||||||
else:
|
else:
|
||||||
required = None
|
required = None
|
||||||
|
|
||||||
f_name = u'_'.join(map(lambda x:x.lower(), name_parts))
|
f_name = u'_'.join(map(lambda x:x.lower(), name_parts))
|
||||||
|
f_name = f_name.replace('&', 'and')
|
||||||
f_name = re.sub(r'[^\w]','', f_name)
|
f_name = re.sub(r'[^\w]','', f_name)
|
||||||
|
|
||||||
yield {
|
yield {
|
||||||
'name': f_name,
|
'name': f_name,
|
||||||
'desc': '(' + f_range + '). ' + f_desc,
|
'range': f_range,
|
||||||
|
'desc': f_desc,
|
||||||
'length': f_length,
|
'length': f_length,
|
||||||
'required': required,
|
'required': required,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _guess_field_types(self, entries):
|
def _guess_field_types(self, entries):
|
||||||
lengthexp = LengthExpression()
|
lengthexp = LengthExpression()
|
||||||
|
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
|
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
|
||||||
|
|
||||||
for (classtype, criteria) in self.FIELD_TYPES:
|
for (classtype, criteria) in self.FIELD_TYPES:
|
||||||
if 'length' in criteria:
|
if 'length' in criteria:
|
||||||
if not lengthexp(int(entry['length']), criteria['length']):
|
if not lengthexp(int(entry['length']), criteria['length']):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if 'regexp' in criteria:
|
if 'regexp' in criteria:
|
||||||
for crit_key, crit_values in criteria['regexp'].items():
|
for crit_key, crit_values in criteria['regexp'].items():
|
||||||
for crit_re in crit_values:
|
for crit_re in crit_values:
|
||||||
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
||||||
|
|
||||||
|
|
||||||
matches = list(matches.items())
|
matches = list(matches.items())
|
||||||
matches.sort(key=lambda x:x[1])
|
matches.sort(key=lambda x:x[1])
|
||||||
|
|
||||||
matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False
|
matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False
|
||||||
|
|
||||||
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
||||||
yield entry
|
yield entry
|
||||||
|
|
||||||
def _convert_to_records(self, entries):
|
def _convert_to_records(self, entries):
|
||||||
blank_count = 1
|
blank_count = 1
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
|
@ -250,10 +259,10 @@ class RecordBuilder(object):
|
||||||
add(entry['name'].ljust(40))
|
add(entry['name'].ljust(40))
|
||||||
|
|
||||||
add(' = ')
|
add(' = ')
|
||||||
|
|
||||||
if entry['guessed_type']:
|
if entry['guessed_type']:
|
||||||
add(entry['guessed_type'].__name__)
|
add(entry['guessed_type'].__name__)
|
||||||
|
|
||||||
args = []
|
args = []
|
||||||
args.append("max_length=%d" % entry['length'])
|
args.append("max_length=%d" % entry['length'])
|
||||||
if entry['required'] != None:
|
if entry['required'] != None:
|
||||||
|
@ -261,13 +270,11 @@ class RecordBuilder(object):
|
||||||
|
|
||||||
add("(" + ", ".join(args) + ")")
|
add("(" + ", ".join(args) + ")")
|
||||||
|
|
||||||
|
yield "".join(result).ljust(85) + "# %s" % entry['range']
|
||||||
yield "".join(result)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PastedDefParser(RecordBuilder):
|
class PastedDefParser(RecordBuilder):
|
||||||
|
|
||||||
def load(self, infile):
|
def load(self, infile):
|
||||||
tokens = self._tokenize(infile)
|
tokens = self._tokenize(infile)
|
||||||
entries = self._parse(tokens)
|
entries = self._parse(tokens)
|
||||||
|
@ -275,7 +282,7 @@ class PastedDefParser(RecordBuilder):
|
||||||
entries = self._guess_field_types(entries)
|
entries = self._guess_field_types(entries)
|
||||||
entries = self._convert_to_records(entries)
|
entries = self._convert_to_records(entries)
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
def _tokenize(self, data):
|
def _tokenize(self, data):
|
||||||
for item in data.replace('\n',' ').split(' '):
|
for item in data.replace('\n',' ').split(' '):
|
||||||
item = item.strip()
|
item = item.strip()
|
||||||
|
@ -299,7 +306,7 @@ class PastedDefParser(RecordBuilder):
|
||||||
current_length = None
|
current_length = None
|
||||||
current_desc = []
|
current_desc = []
|
||||||
state = 'range'
|
state = 'range'
|
||||||
|
|
||||||
byte_pos = None
|
byte_pos = None
|
||||||
|
|
||||||
# COLLECT TOKENS INTO GROUPS
|
# COLLECT TOKENS INTO GROUPS
|
||||||
|
@ -311,7 +318,7 @@ class PastedDefParser(RecordBuilder):
|
||||||
if byte_pos == None or token.value == byte_pos:
|
if byte_pos == None or token.value == byte_pos:
|
||||||
# UPDATE RANGE POSITION
|
# UPDATE RANGE POSITION
|
||||||
byte_pos = token.value + 1
|
byte_pos = token.value + 1
|
||||||
|
|
||||||
# CONVERT TOKEN INTO RangeToken
|
# CONVERT TOKEN INTO RangeToken
|
||||||
token = RangeToken("%d-%d" % (token.value, token.value))
|
token = RangeToken("%d-%d" % (token.value, token.value))
|
||||||
|
|
||||||
|
@ -323,7 +330,7 @@ class PastedDefParser(RecordBuilder):
|
||||||
#if byte_pos and token and state == 'desc' and token.start_position != byte_pos:
|
#if byte_pos and token and state == 'desc' and token.start_position != byte_pos:
|
||||||
# print token.start_position, byte_pos
|
# print token.start_position, byte_pos
|
||||||
# current_desc.append(token)
|
# current_desc.append(token)
|
||||||
|
|
||||||
if token and byte_pos and token.start_position != byte_pos:
|
if token and byte_pos and token.start_position != byte_pos:
|
||||||
state = 'desc'
|
state = 'desc'
|
||||||
|
|
||||||
|
@ -339,13 +346,13 @@ class PastedDefParser(RecordBuilder):
|
||||||
# UPDATE RANGE POSITION
|
# UPDATE RANGE POSITION
|
||||||
if token:
|
if token:
|
||||||
byte_pos = token.end_position + 1
|
byte_pos = token.end_position + 1
|
||||||
|
|
||||||
current_range = token
|
current_range = token
|
||||||
current_name = []
|
current_name = []
|
||||||
current_length = None
|
current_length = None
|
||||||
current_desc = []
|
current_desc = []
|
||||||
state = 'name'
|
state = 'name'
|
||||||
|
|
||||||
elif state == 'name':
|
elif state == 'name':
|
||||||
if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken):
|
if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken):
|
||||||
current_length = current_name.pop()
|
current_length = current_name.pop()
|
||||||
|
@ -361,9 +368,9 @@ class PastedDefParser(RecordBuilder):
|
||||||
def _compile(self, groups):
|
def _compile(self, groups):
|
||||||
for g in groups:
|
for g in groups:
|
||||||
assert g['byterange'].value == g['length'].value
|
assert g['byterange'].value == g['length'].value
|
||||||
|
|
||||||
desc = u' '.join(map(lambda x:unicode(x.value), g['desc']))
|
desc = u' '.join(map(lambda x:unicode(x.value), g['desc']))
|
||||||
|
|
||||||
if g['name'][-1].value.lower() == '(optional)':
|
if g['name'][-1].value.lower() == '(optional)':
|
||||||
g['name'] = g['name'][0:-1]
|
g['name'] = g['name'][0:-1]
|
||||||
required = False
|
required = False
|
||||||
|
@ -374,14 +381,14 @@ class PastedDefParser(RecordBuilder):
|
||||||
|
|
||||||
name = u'_'.join(map(lambda x:x.value.lower(), g['name']))
|
name = u'_'.join(map(lambda x:x.value.lower(), g['name']))
|
||||||
name = re.sub(r'[^\w]','', name)
|
name = re.sub(r'[^\w]','', name)
|
||||||
|
|
||||||
yield({
|
yield({
|
||||||
'name': name,
|
'name': name,
|
||||||
'desc': desc,
|
'desc': desc,
|
||||||
'length': g['byterange'].value,
|
'length': g['byterange'].value,
|
||||||
'required': required,
|
'required': required,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def _guess_field_types(self, entries):
|
def _guess_field_types(self, entries):
|
||||||
|
@ -389,26 +396,26 @@ class PastedDefParser(RecordBuilder):
|
||||||
|
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
|
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
|
||||||
|
|
||||||
for (classtype, criteria) in self.FIELD_TYPES:
|
for (classtype, criteria) in self.FIELD_TYPES:
|
||||||
if 'length' in criteria:
|
if 'length' in criteria:
|
||||||
if not lengthexp(int(entry['length']), criteria['length']):
|
if not lengthexp(int(entry['length']), criteria['length']):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if 'regexp' in criteria:
|
if 'regexp' in criteria:
|
||||||
for crit_key, crit_values in criteria['regexp'].items():
|
for crit_key, crit_values in criteria['regexp'].items():
|
||||||
for crit_re in crit_values:
|
for crit_re in crit_values:
|
||||||
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
||||||
|
|
||||||
|
|
||||||
matches = list(matches.items())
|
matches = list(matches.items())
|
||||||
matches.sort(key=lambda x:x[1])
|
matches.sort(key=lambda x:x[1])
|
||||||
|
|
||||||
matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False
|
matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False
|
||||||
|
|
||||||
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
||||||
yield entry
|
yield entry
|
||||||
|
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
def _convert_to_records(self, entries):
|
def _convert_to_records(self, entries):
|
||||||
|
@ -425,10 +432,10 @@ class PastedDefParser(RecordBuilder):
|
||||||
add(entry['name'].ljust(40))
|
add(entry['name'].ljust(40))
|
||||||
|
|
||||||
add(' = ')
|
add(' = ')
|
||||||
|
|
||||||
if entry['guessed_type']:
|
if entry['guessed_type']:
|
||||||
add(entry['guessed_type'].__name__)
|
add(entry['guessed_type'].__name__)
|
||||||
|
|
||||||
args = []
|
args = []
|
||||||
args.append("max_length=%d" % entry['length'])
|
args.append("max_length=%d" % entry['length'])
|
||||||
if entry['required'] != None:
|
if entry['required'] != None:
|
||||||
|
|
|
@ -53,17 +53,8 @@ class PDFRecordFinder(object):
|
||||||
for (i, row) in enumerate(self.textrows):
|
for (i, row) in enumerate(self.textrows):
|
||||||
match = self.heading_exp.match(row)
|
match = self.heading_exp.match(row)
|
||||||
if match:
|
if match:
|
||||||
#print i,match.groups()
|
|
||||||
results.append((i, ''.join(match.groups())))
|
results.append((i, ''.join(match.groups())))
|
||||||
|
|
||||||
"""
|
|
||||||
results2 = []
|
|
||||||
for r in results:
|
|
||||||
if len(results2)==0 or results2[-1:][0][1] != r[1]:
|
|
||||||
results2.append(r)
|
|
||||||
results = results2
|
|
||||||
"""
|
|
||||||
|
|
||||||
merged = []
|
merged = []
|
||||||
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
|
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
|
||||||
merged.append( (a[0], b[0]-1, a[1]) )
|
merged.append( (a[0], b[0]-1, a[1]) )
|
||||||
|
@ -84,6 +75,29 @@ class PDFRecordFinder(object):
|
||||||
|
|
||||||
def find_fields(self, row_iter):
|
def find_fields(self, row_iter):
|
||||||
cc = ColumnCollector()
|
cc = ColumnCollector()
|
||||||
|
blank_row_counter = 0
|
||||||
|
|
||||||
|
for r in row_iter:
|
||||||
|
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
||||||
|
|
||||||
|
if not row:
|
||||||
|
cc.empty_row()
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
cc.add(row)
|
||||||
|
except IsNextField, e:
|
||||||
|
yield cc
|
||||||
|
cc = ColumnCollector()
|
||||||
|
cc.add(row)
|
||||||
|
except UnknownColumn, e:
|
||||||
|
raise StopIteration
|
||||||
|
|
||||||
|
yield cc
|
||||||
|
|
||||||
|
def find_fields_old(self, row_iter):
|
||||||
|
cc = ColumnCollector()
|
||||||
|
|
||||||
for r in row_iter:
|
for r in row_iter:
|
||||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
||||||
|
|
||||||
|
@ -151,9 +165,47 @@ class IsNextField(Exception):
|
||||||
class ColumnCollector(object):
|
class ColumnCollector(object):
|
||||||
def __init__(self, initial=None):
|
def __init__(self, initial=None):
|
||||||
self.data = None
|
self.data = None
|
||||||
|
self.column_widths = None
|
||||||
|
self.max_data_length = 0
|
||||||
|
self.adjust_pad = 3
|
||||||
|
self.empty_rows = 0
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return "<%s: %s>" % (self.__class__.__name__, map(lambda x:x if len(x) < 25 else x[:25] + '..', self.data.values()))
|
||||||
|
|
||||||
def add(self, data):
|
def add(self, data):
|
||||||
|
if self.empty_rows > 2:
|
||||||
|
raise IsNextField()
|
||||||
|
|
||||||
|
if not self.data:
|
||||||
|
self.data = dict(data)
|
||||||
|
else:
|
||||||
|
data = self.adjust_columns(data)
|
||||||
|
if self.is_next_field(data):
|
||||||
|
raise IsNextField()
|
||||||
|
for col_id, value in data:
|
||||||
|
self.merge_column(col_id, value)
|
||||||
|
|
||||||
|
self.update_column_widths(data)
|
||||||
|
|
||||||
|
def empty_row(self):
|
||||||
|
self.empty_rows += 1
|
||||||
|
|
||||||
|
def update_column_widths(self, data):
|
||||||
|
self.last_data_length = len(data)
|
||||||
|
self.max_data_length = max(self.max_data_length, len(data))
|
||||||
|
|
||||||
|
if not self.column_widths:
|
||||||
|
self.column_widths = dict(map(lambda (column, value): [column, column + len(value)], data))
|
||||||
|
else:
|
||||||
|
for col_id, value in data:
|
||||||
|
try:
|
||||||
|
self.column_widths[col_id] = max(self.column_widths[col_id], col_id + len(value.strip()))
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def add_old(self, data):
|
||||||
if not self.data:
|
if not self.data:
|
||||||
self.data = dict(data)
|
self.data = dict(data)
|
||||||
else:
|
else:
|
||||||
|
@ -162,10 +214,28 @@ class ColumnCollector(object):
|
||||||
for col_id, value in data:
|
for col_id, value in data:
|
||||||
self.merge_column(col_id, value)
|
self.merge_column(col_id, value)
|
||||||
|
|
||||||
|
|
||||||
|
def adjust_columns(self, data):
|
||||||
|
adjusted_data = {}
|
||||||
|
|
||||||
|
|
||||||
|
for col_id, value in data:
|
||||||
|
if col_id in self.data.keys():
|
||||||
|
adjusted_data[col_id] = value.strip()
|
||||||
|
else:
|
||||||
|
for col_start, col_end in self.column_widths.items():
|
||||||
|
if (col_start - self.adjust_pad) <= col_id and (col_end + self.adjust_pad) >= col_id:
|
||||||
|
if col_start in adjusted_data:
|
||||||
|
adjusted_data[col_start] += ' ' + value.strip()
|
||||||
|
else:
|
||||||
|
adjusted_data[col_start] = value.strip()
|
||||||
|
|
||||||
|
return adjusted_data.items()
|
||||||
|
|
||||||
|
|
||||||
def merge_column(self, col_id, value):
|
def merge_column(self, col_id, value):
|
||||||
if col_id in self.data.keys():
|
if col_id in self.data.keys():
|
||||||
self.data[col_id] += ' ' + value.strip()
|
self.data[col_id] += ' ' + value.strip()
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# try adding a wiggle room value?
|
# try adding a wiggle room value?
|
||||||
# FIXME:
|
# FIXME:
|
||||||
|
@ -175,6 +245,12 @@ class ColumnCollector(object):
|
||||||
# after the maximum column, and assume it's part of the
|
# after the maximum column, and assume it's part of the
|
||||||
# max column?
|
# max column?
|
||||||
|
|
||||||
|
"""
|
||||||
|
for col_start, col_end in self.column_widths.items():
|
||||||
|
if col_start <= col_id and (col_end) >= col_id:
|
||||||
|
self.data[col_start] += ' ' + value.strip()
|
||||||
|
return
|
||||||
|
"""
|
||||||
raise UnknownColumn
|
raise UnknownColumn
|
||||||
|
|
||||||
def is_next_field(self, data):
|
def is_next_field(self, data):
|
||||||
|
@ -185,13 +261,35 @@ class ColumnCollector(object):
|
||||||
the next field. Raise an exception and continue
|
the next field. Raise an exception and continue
|
||||||
on with a fresh ColumnCollector.
|
on with a fresh ColumnCollector.
|
||||||
"""
|
"""
|
||||||
first_key = dict(data).keys()[0]
|
|
||||||
if self.data:
|
""" If the length of the value in column_id is less than the position of the next column_id,
|
||||||
return self.data.keys()[0] == first_key
|
then this is probably a continuation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if self.data and data:
|
||||||
|
keys = dict(self.column_widths).keys()
|
||||||
|
keys.sort()
|
||||||
|
keys += [None]
|
||||||
|
|
||||||
|
if self.last_data_length < len(data):
|
||||||
|
return True
|
||||||
|
|
||||||
|
first_key, first_value = dict(data).items()[0]
|
||||||
|
if self.data.keys()[0] == first_key:
|
||||||
|
|
||||||
|
position = keys.index(first_key)
|
||||||
|
max_length = keys[position + 1]
|
||||||
|
if max_length:
|
||||||
|
return len(first_value) > max_length or len(data) == self.max_data_length
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tuple(self):
|
def tuple(self):
|
||||||
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
try:
|
||||||
|
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
||||||
|
except:
|
||||||
|
import pdb
|
||||||
|
pdb.set_trace()
|
||||||
|
|
||||||
|
|
|
@ -32,11 +32,46 @@ doc = PDFRecordFinder(source_file)
|
||||||
records = doc.records()
|
records = doc.records()
|
||||||
builder = RecordBuilder()
|
builder = RecordBuilder()
|
||||||
|
|
||||||
|
def record_begins_at(field):
|
||||||
|
return int(fields[0].data.values()[0].split('-')[0], 10)
|
||||||
|
|
||||||
|
def record_ends_at(fields):
|
||||||
|
return int(fields[-1].data.values()[0].split('-')[-1], 10)
|
||||||
|
|
||||||
|
last_record_begins_at = -1
|
||||||
|
last_record_ends_at = -1
|
||||||
|
|
||||||
for rec in records:
|
for rec in records:
|
||||||
|
#if not rec[1]:
|
||||||
sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))
|
# continue # no actual fields detected
|
||||||
|
fields = rec[1]
|
||||||
|
|
||||||
|
# strip out fields that are not 4 items long
|
||||||
|
fields = filter(lambda x:len(x.tuple) == 4, fields)
|
||||||
|
|
||||||
|
# strip fields that don't begin at position 0
|
||||||
|
fields = filter(lambda x: 0 in x.data, fields)
|
||||||
|
|
||||||
|
# strip fields that don't have a length-range type item in position 0
|
||||||
|
fields = filter(lambda x: re.match('^\d+[-]?\d*$', x.data[0]), fields)
|
||||||
|
|
||||||
|
if not fields:
|
||||||
|
continue
|
||||||
|
|
||||||
|
begins_at = record_begins_at(fields)
|
||||||
|
ends_at = record_ends_at(fields)
|
||||||
|
|
||||||
|
# FIXME record_ends_at is randomly exploding due to record data being
|
||||||
|
# a lump of text and not necessarily a field entry. I assume
|
||||||
|
# this is cleaned out by the record builder class.
|
||||||
|
|
||||||
|
print last_record_ends_at + 1, begins_at
|
||||||
|
#if last_record_ends_at + 1 != begins_at:
|
||||||
|
sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0]))
|
||||||
|
|
||||||
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
||||||
sys.stdout.write('\t' + field + '\n')
|
sys.stdout.write('\t' + field + '\n')
|
||||||
#print field
|
#print field
|
||||||
|
|
||||||
|
last_record_ends_at = ends_at
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue