Parsing all the way through the PDF now appears to work. Next we need
to track the beginning and ending positions of each record and append continuation records onto the previous record. There is an issue in the pyaccuwage-pdfparse script that causes problems when reading the last field of a record group. Maybe the record extractor should discard the last failed ColumnCollector, rather than return it, when it is determined to hold junk data? The record builder seems to handle everything just fine. Added a step to the field-name parsing that replaces ampersands with the string "and", so that they do not cause problems in generated variable names.
This commit is contained in:
parent
fe4bd20bad
commit
1c7533973a
3 changed files with 74 additions and 71 deletions
|
@ -39,7 +39,7 @@ class SimpleDefParser(object):
|
||||||
else:
|
else:
|
||||||
item = self._intify(item)
|
item = self._intify(item)
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
|
|
||||||
class LengthExpression(object):
|
class LengthExpression(object):
|
||||||
import operator
|
import operator
|
||||||
|
@ -115,14 +115,16 @@ class NumericToken(BaseToken):
|
||||||
@property
|
@property
|
||||||
def value(self):
|
def value(self):
|
||||||
return int(self._value)
|
return int(self._value)
|
||||||
|
|
||||||
|
|
||||||
class RecordBuilder(object):
|
class RecordBuilder(object):
|
||||||
import fields
|
import fields
|
||||||
|
|
||||||
|
entry_max_length = 4
|
||||||
|
|
||||||
TOKEN_TYPES = [
|
TOKEN_TYPES = [
|
||||||
RangeToken,
|
RangeToken,
|
||||||
NumericToken,
|
NumericToken,
|
||||||
StringToken,
|
StringToken,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -134,7 +136,7 @@ class RecordBuilder(object):
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
|
|
||||||
(fields.MoneyField, {
|
(fields.MoneyField, {
|
||||||
'regexp': {
|
'regexp': {
|
||||||
'desc': [
|
'desc': [
|
||||||
|
@ -144,7 +146,7 @@ class RecordBuilder(object):
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
|
|
||||||
(fields.TextField, {
|
(fields.TextField, {
|
||||||
'regexp': {
|
'regexp': {
|
||||||
'desc': [
|
'desc': [
|
||||||
|
@ -171,13 +173,16 @@ class RecordBuilder(object):
|
||||||
entries = self._guess_field_types(entries)
|
entries = self._guess_field_types(entries)
|
||||||
entries = self._convert_to_records(entries)
|
entries = self._convert_to_records(entries)
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
def _compile(self, entries):
|
def _compile(self, entries):
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
|
|
||||||
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry))
|
if len(entry) > self.entry_max_length:
|
||||||
|
continue
|
||||||
|
|
||||||
|
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(self.entry_max_length-len(entry))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
f_length = int(f_length)
|
f_length = int(f_length)
|
||||||
except ValueError, e:
|
except ValueError, e:
|
||||||
|
@ -186,9 +191,11 @@ class RecordBuilder(object):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
assert f_length == RangeToken(f_range).value
|
assert f_length == RangeToken(f_range).value
|
||||||
except AssertionError:
|
except AssertionError, e:
|
||||||
import pdb
|
continue
|
||||||
pdb.set_trace()
|
except ValueError, e:
|
||||||
|
# bad result, skip
|
||||||
|
continue
|
||||||
|
|
||||||
name_parts = f_name.split(' ')
|
name_parts = f_name.split(' ')
|
||||||
|
|
||||||
|
@ -199,43 +206,45 @@ class RecordBuilder(object):
|
||||||
required = True
|
required = True
|
||||||
else:
|
else:
|
||||||
required = None
|
required = None
|
||||||
|
|
||||||
f_name = u'_'.join(map(lambda x:x.lower(), name_parts))
|
f_name = u'_'.join(map(lambda x:x.lower(), name_parts))
|
||||||
|
f_name = f_name.replace('&', 'and')
|
||||||
f_name = re.sub(r'[^\w]','', f_name)
|
f_name = re.sub(r'[^\w]','', f_name)
|
||||||
|
|
||||||
yield {
|
yield {
|
||||||
'name': f_name,
|
'name': f_name,
|
||||||
'desc': '(' + f_range + '). ' + f_desc,
|
'range': f_range,
|
||||||
|
'desc': f_desc,
|
||||||
'length': f_length,
|
'length': f_length,
|
||||||
'required': required,
|
'required': required,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _guess_field_types(self, entries):
|
def _guess_field_types(self, entries):
|
||||||
lengthexp = LengthExpression()
|
lengthexp = LengthExpression()
|
||||||
|
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
|
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
|
||||||
|
|
||||||
for (classtype, criteria) in self.FIELD_TYPES:
|
for (classtype, criteria) in self.FIELD_TYPES:
|
||||||
if 'length' in criteria:
|
if 'length' in criteria:
|
||||||
if not lengthexp(int(entry['length']), criteria['length']):
|
if not lengthexp(int(entry['length']), criteria['length']):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if 'regexp' in criteria:
|
if 'regexp' in criteria:
|
||||||
for crit_key, crit_values in criteria['regexp'].items():
|
for crit_key, crit_values in criteria['regexp'].items():
|
||||||
for crit_re in crit_values:
|
for crit_re in crit_values:
|
||||||
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
||||||
|
|
||||||
|
|
||||||
matches = list(matches.items())
|
matches = list(matches.items())
|
||||||
matches.sort(key=lambda x:x[1])
|
matches.sort(key=lambda x:x[1])
|
||||||
|
|
||||||
matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False
|
matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False
|
||||||
|
|
||||||
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
||||||
yield entry
|
yield entry
|
||||||
|
|
||||||
def _convert_to_records(self, entries):
|
def _convert_to_records(self, entries):
|
||||||
blank_count = 1
|
blank_count = 1
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
|
@ -250,10 +259,10 @@ class RecordBuilder(object):
|
||||||
add(entry['name'].ljust(40))
|
add(entry['name'].ljust(40))
|
||||||
|
|
||||||
add(' = ')
|
add(' = ')
|
||||||
|
|
||||||
if entry['guessed_type']:
|
if entry['guessed_type']:
|
||||||
add(entry['guessed_type'].__name__)
|
add(entry['guessed_type'].__name__)
|
||||||
|
|
||||||
args = []
|
args = []
|
||||||
args.append("max_length=%d" % entry['length'])
|
args.append("max_length=%d" % entry['length'])
|
||||||
if entry['required'] != None:
|
if entry['required'] != None:
|
||||||
|
@ -261,13 +270,11 @@ class RecordBuilder(object):
|
||||||
|
|
||||||
add("(" + ", ".join(args) + ")")
|
add("(" + ", ".join(args) + ")")
|
||||||
|
|
||||||
|
yield "".join(result).ljust(85) + "# %s" % entry['range']
|
||||||
yield "".join(result)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PastedDefParser(RecordBuilder):
|
class PastedDefParser(RecordBuilder):
|
||||||
|
|
||||||
def load(self, infile):
|
def load(self, infile):
|
||||||
tokens = self._tokenize(infile)
|
tokens = self._tokenize(infile)
|
||||||
entries = self._parse(tokens)
|
entries = self._parse(tokens)
|
||||||
|
@ -275,7 +282,7 @@ class PastedDefParser(RecordBuilder):
|
||||||
entries = self._guess_field_types(entries)
|
entries = self._guess_field_types(entries)
|
||||||
entries = self._convert_to_records(entries)
|
entries = self._convert_to_records(entries)
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
def _tokenize(self, data):
|
def _tokenize(self, data):
|
||||||
for item in data.replace('\n',' ').split(' '):
|
for item in data.replace('\n',' ').split(' '):
|
||||||
item = item.strip()
|
item = item.strip()
|
||||||
|
@ -299,7 +306,7 @@ class PastedDefParser(RecordBuilder):
|
||||||
current_length = None
|
current_length = None
|
||||||
current_desc = []
|
current_desc = []
|
||||||
state = 'range'
|
state = 'range'
|
||||||
|
|
||||||
byte_pos = None
|
byte_pos = None
|
||||||
|
|
||||||
# COLLECT TOKENS INTO GROUPS
|
# COLLECT TOKENS INTO GROUPS
|
||||||
|
@ -311,7 +318,7 @@ class PastedDefParser(RecordBuilder):
|
||||||
if byte_pos == None or token.value == byte_pos:
|
if byte_pos == None or token.value == byte_pos:
|
||||||
# UPDATE RANGE POSITION
|
# UPDATE RANGE POSITION
|
||||||
byte_pos = token.value + 1
|
byte_pos = token.value + 1
|
||||||
|
|
||||||
# CONVERT TOKEN INTO RangeToken
|
# CONVERT TOKEN INTO RangeToken
|
||||||
token = RangeToken("%d-%d" % (token.value, token.value))
|
token = RangeToken("%d-%d" % (token.value, token.value))
|
||||||
|
|
||||||
|
@ -323,7 +330,7 @@ class PastedDefParser(RecordBuilder):
|
||||||
#if byte_pos and token and state == 'desc' and token.start_position != byte_pos:
|
#if byte_pos and token and state == 'desc' and token.start_position != byte_pos:
|
||||||
# print token.start_position, byte_pos
|
# print token.start_position, byte_pos
|
||||||
# current_desc.append(token)
|
# current_desc.append(token)
|
||||||
|
|
||||||
if token and byte_pos and token.start_position != byte_pos:
|
if token and byte_pos and token.start_position != byte_pos:
|
||||||
state = 'desc'
|
state = 'desc'
|
||||||
|
|
||||||
|
@ -339,13 +346,13 @@ class PastedDefParser(RecordBuilder):
|
||||||
# UPDATE RANGE POSITION
|
# UPDATE RANGE POSITION
|
||||||
if token:
|
if token:
|
||||||
byte_pos = token.end_position + 1
|
byte_pos = token.end_position + 1
|
||||||
|
|
||||||
current_range = token
|
current_range = token
|
||||||
current_name = []
|
current_name = []
|
||||||
current_length = None
|
current_length = None
|
||||||
current_desc = []
|
current_desc = []
|
||||||
state = 'name'
|
state = 'name'
|
||||||
|
|
||||||
elif state == 'name':
|
elif state == 'name':
|
||||||
if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken):
|
if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken):
|
||||||
current_length = current_name.pop()
|
current_length = current_name.pop()
|
||||||
|
@ -361,9 +368,9 @@ class PastedDefParser(RecordBuilder):
|
||||||
def _compile(self, groups):
|
def _compile(self, groups):
|
||||||
for g in groups:
|
for g in groups:
|
||||||
assert g['byterange'].value == g['length'].value
|
assert g['byterange'].value == g['length'].value
|
||||||
|
|
||||||
desc = u' '.join(map(lambda x:unicode(x.value), g['desc']))
|
desc = u' '.join(map(lambda x:unicode(x.value), g['desc']))
|
||||||
|
|
||||||
if g['name'][-1].value.lower() == '(optional)':
|
if g['name'][-1].value.lower() == '(optional)':
|
||||||
g['name'] = g['name'][0:-1]
|
g['name'] = g['name'][0:-1]
|
||||||
required = False
|
required = False
|
||||||
|
@ -374,14 +381,14 @@ class PastedDefParser(RecordBuilder):
|
||||||
|
|
||||||
name = u'_'.join(map(lambda x:x.value.lower(), g['name']))
|
name = u'_'.join(map(lambda x:x.value.lower(), g['name']))
|
||||||
name = re.sub(r'[^\w]','', name)
|
name = re.sub(r'[^\w]','', name)
|
||||||
|
|
||||||
yield({
|
yield({
|
||||||
'name': name,
|
'name': name,
|
||||||
'desc': desc,
|
'desc': desc,
|
||||||
'length': g['byterange'].value,
|
'length': g['byterange'].value,
|
||||||
'required': required,
|
'required': required,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def _guess_field_types(self, entries):
|
def _guess_field_types(self, entries):
|
||||||
|
@ -389,26 +396,26 @@ class PastedDefParser(RecordBuilder):
|
||||||
|
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
|
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
|
||||||
|
|
||||||
for (classtype, criteria) in self.FIELD_TYPES:
|
for (classtype, criteria) in self.FIELD_TYPES:
|
||||||
if 'length' in criteria:
|
if 'length' in criteria:
|
||||||
if not lengthexp(int(entry['length']), criteria['length']):
|
if not lengthexp(int(entry['length']), criteria['length']):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if 'regexp' in criteria:
|
if 'regexp' in criteria:
|
||||||
for crit_key, crit_values in criteria['regexp'].items():
|
for crit_key, crit_values in criteria['regexp'].items():
|
||||||
for crit_re in crit_values:
|
for crit_re in crit_values:
|
||||||
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
||||||
|
|
||||||
|
|
||||||
matches = list(matches.items())
|
matches = list(matches.items())
|
||||||
matches.sort(key=lambda x:x[1])
|
matches.sort(key=lambda x:x[1])
|
||||||
|
|
||||||
matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False
|
matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False
|
||||||
|
|
||||||
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
||||||
yield entry
|
yield entry
|
||||||
|
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
def _convert_to_records(self, entries):
|
def _convert_to_records(self, entries):
|
||||||
|
@ -425,10 +432,10 @@ class PastedDefParser(RecordBuilder):
|
||||||
add(entry['name'].ljust(40))
|
add(entry['name'].ljust(40))
|
||||||
|
|
||||||
add(' = ')
|
add(' = ')
|
||||||
|
|
||||||
if entry['guessed_type']:
|
if entry['guessed_type']:
|
||||||
add(entry['guessed_type'].__name__)
|
add(entry['guessed_type'].__name__)
|
||||||
|
|
||||||
args = []
|
args = []
|
||||||
args.append("max_length=%d" % entry['length'])
|
args.append("max_length=%d" % entry['length'])
|
||||||
if entry['required'] != None:
|
if entry['required'] != None:
|
||||||
|
|
|
@ -33,14 +33,6 @@ class PDFRecordFinder(object):
|
||||||
if match:
|
if match:
|
||||||
results.append((i, ''.join(match.groups())))
|
results.append((i, ''.join(match.groups())))
|
||||||
|
|
||||||
"""
|
|
||||||
results2 = []
|
|
||||||
for r in results:
|
|
||||||
if len(results2)==0 or results2[-1:][0][1] != r[1]:
|
|
||||||
results2.append(r)
|
|
||||||
results = results2
|
|
||||||
"""
|
|
||||||
|
|
||||||
merged = []
|
merged = []
|
||||||
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
|
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
|
||||||
merged.append( (a[0], b[0]-1, a[1]) )
|
merged.append( (a[0], b[0]-1, a[1]) )
|
||||||
|
@ -57,7 +49,6 @@ class PDFRecordFinder(object):
|
||||||
if not row:
|
if not row:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
||||||
#if cc.is_next_field(row):
|
#if cc.is_next_field(row):
|
||||||
# print len(cc.data)
|
# print len(cc.data)
|
||||||
# yield cc
|
# yield cc
|
||||||
|
@ -102,8 +93,8 @@ class PDFRecordFinder(object):
|
||||||
re_multiwhite = re.compile(r'\s{2,}')
|
re_multiwhite = re.compile(r'\s{2,}')
|
||||||
|
|
||||||
# IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE
|
# IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE
|
||||||
#if not re_multiwhite.search(row):
|
if not re_multiwhite.search(row):
|
||||||
# return None
|
return None
|
||||||
|
|
||||||
white_ranges = [0,]
|
white_ranges = [0,]
|
||||||
pos = 0
|
pos = 0
|
||||||
|
@ -145,6 +136,7 @@ class ColumnCollector(object):
|
||||||
self.data = None
|
self.data = None
|
||||||
self.column_widths = None
|
self.column_widths = None
|
||||||
self.max_data_length = 0
|
self.max_data_length = 0
|
||||||
|
self.adjust_pad = 3
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def add(self, data):
|
def add(self, data):
|
||||||
|
@ -190,11 +182,12 @@ class ColumnCollector(object):
|
||||||
adjusted_data[col_id] = value.strip()
|
adjusted_data[col_id] = value.strip()
|
||||||
else:
|
else:
|
||||||
for col_start, col_end in self.column_widths.items():
|
for col_start, col_end in self.column_widths.items():
|
||||||
if col_start <= col_id and (col_end) >= col_id:
|
if (col_start - self.adjust_pad) <= col_id and (col_end + self.adjust_pad) >= col_id:
|
||||||
if col_start in adjusted_data:
|
if col_start in adjusted_data:
|
||||||
adjusted_data[col_start] += ' ' + value.strip()
|
adjusted_data[col_start] += ' ' + value.strip()
|
||||||
else:
|
else:
|
||||||
adjusted_data[col_start] = value.strip()
|
adjusted_data[col_start] = value.strip()
|
||||||
|
|
||||||
return adjusted_data.items()
|
return adjusted_data.items()
|
||||||
|
|
||||||
|
|
||||||
|
@ -231,7 +224,7 @@ class ColumnCollector(object):
|
||||||
then this is probably a continuation.
|
then this is probably a continuation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if self.data:
|
if self.data and data:
|
||||||
keys = dict(self.column_widths).keys()
|
keys = dict(self.column_widths).keys()
|
||||||
keys.sort()
|
keys.sort()
|
||||||
keys += [None]
|
keys += [None]
|
||||||
|
@ -244,20 +237,11 @@ class ColumnCollector(object):
|
||||||
|
|
||||||
position = keys.index(first_key)
|
position = keys.index(first_key)
|
||||||
max_length = keys[position + 1]
|
max_length = keys[position + 1]
|
||||||
print 'test', len(first_value), max_length
|
|
||||||
if max_length:
|
if max_length:
|
||||||
return len(first_value) > max_length or len(data) == self.max_data_length
|
return len(first_value) > max_length or len(data) == self.max_data_length
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
#for key, nextkey in map(lambda x:(keys[x], keys[x+1]), range(len(keys)-1)):
|
|
||||||
# print 'key', key, nextkey
|
|
||||||
|
|
||||||
first_key, first_value = dict(data).items()[0]
|
|
||||||
if self.data:
|
|
||||||
#print self.data.keys()[0], first_key, first_value, self.column_widths
|
|
||||||
return self.data.keys()[0] == first_key # and len(first_value) > self.column_widths[first_key]
|
|
||||||
return False
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tuple(self):
|
def tuple(self):
|
||||||
|
|
|
@ -32,11 +32,23 @@ doc = PDFRecordFinder(source_file)
|
||||||
records = doc.records()
|
records = doc.records()
|
||||||
builder = RecordBuilder()
|
builder = RecordBuilder()
|
||||||
|
|
||||||
|
def record_begins_at(record):
|
||||||
|
return int(record[1][1].data.values()[0].split('-')[0], 10)
|
||||||
|
|
||||||
|
def record_ends_at(record):
|
||||||
|
return record[1][-1].data
|
||||||
|
return int(record[1][-1].data.values()[0].split('-')[-1], 10)
|
||||||
|
|
||||||
for rec in records:
|
for rec in records:
|
||||||
|
|
||||||
|
print record_begins_at(rec) #, 'to', record_ends_at(rec)
|
||||||
|
# FIXME record_ends_at is randomly exploding due to record data being
|
||||||
|
# a lump of text and not necessarily a field entry. I assume
|
||||||
|
# this is cleaned out by the record builder class.
|
||||||
|
|
||||||
sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))
|
sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))
|
||||||
|
|
||||||
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
||||||
sys.stdout.write('\t' + field + '\n')
|
sys.stdout.write('\t' + field + '\n')
|
||||||
#print field
|
#print field
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue