From 1c7533973a61cf3be7ccdb2b607625fe8addd9d3 Mon Sep 17 00:00:00 2001 From: Binh Nguyen Date: Tue, 13 Nov 2012 15:53:41 -0600 Subject: [PATCH] Parsing all the way through the pdf appears to work. Next we need to track the beginning/ending points for each record and append continuation records onto the previous. There's some issue in the pyaccuwage-pdfparse script causing it to have problems reading the last record field in a record group. Maybe the record extractor needs to dump the last failed ColumnCollector rather than return it if it's determined to hold junk data? The record builder seems to handle everything just fine. Added a function to the field name parsing to replace ampersands with an "and" string so as not to cause problems with variable names. --- pyaccuwage/parser.py | 99 ++++++++++++++++++++----------------- pyaccuwage/pdfextract.py | 28 +++-------- scripts/pyaccuwage-pdfparse | 18 +++++-- 3 files changed, 74 insertions(+), 71 deletions(-) diff --git a/pyaccuwage/parser.py b/pyaccuwage/parser.py index b06f6d4..1b475ca 100644 --- a/pyaccuwage/parser.py +++ b/pyaccuwage/parser.py @@ -39,7 +39,7 @@ class SimpleDefParser(object): else: item = self._intify(item) yield item - + class LengthExpression(object): import operator @@ -115,14 +115,16 @@ class NumericToken(BaseToken): @property def value(self): return int(self._value) - + class RecordBuilder(object): import fields + entry_max_length = 4 + TOKEN_TYPES = [ RangeToken, - NumericToken, + NumericToken, StringToken, ] @@ -134,7 +136,7 @@ class RecordBuilder(object): ], }, }), - + (fields.MoneyField, { 'regexp': { 'desc': [ @@ -144,7 +146,7 @@ class RecordBuilder(object): ], }, }), - + (fields.TextField, { 'regexp': { 'desc': [ @@ -171,13 +173,16 @@ class RecordBuilder(object): entries = self._guess_field_types(entries) entries = self._convert_to_records(entries) return entries - + def _compile(self, entries): for entry in entries: - - (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry)) - + + if len(entry) > self.entry_max_length: + continue + + (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(self.entry_max_length-len(entry)) + try: f_length = int(f_length) except ValueError, e: @@ -186,9 +191,11 @@ class RecordBuilder(object): try: assert f_length == RangeToken(f_range).value - except AssertionError: - import pdb - pdb.set_trace() + except AssertionError, e: + continue + except ValueError, e: + # bad result, skip + continue name_parts = f_name.split(' ') @@ -199,43 +206,45 @@ class RecordBuilder(object): required = True else: required = None - + f_name = u'_'.join(map(lambda x:x.lower(), name_parts)) + f_name = f_name.replace('&', 'and') f_name = re.sub(r'[^\w]','', f_name) - + yield { 'name': f_name, - 'desc': '(' + f_range + '). ' + f_desc, + 'range': f_range, + 'desc': f_desc, 'length': f_length, 'required': required, } - + def _guess_field_types(self, entries): lengthexp = LengthExpression() for entry in entries: matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES)) - + for (classtype, criteria) in self.FIELD_TYPES: if 'length' in criteria: if not lengthexp(int(entry['length']), criteria['length']): continue - + if 'regexp' in criteria: for crit_key, crit_values in criteria['regexp'].items(): for crit_re in crit_values: matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 - - + + matches = list(matches.items()) matches.sort(key=lambda x:x[1]) - + matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField yield entry - + def _convert_to_records(self, entries): blank_count = 1 for entry in entries: @@ -250,10 +259,10 @@ class RecordBuilder(object): add(entry['name'].ljust(40)) add(' = ') - + if entry['guessed_type']: add(entry['guessed_type'].__name__) - + args = [] args.append("max_length=%d" % entry['length']) if entry['required'] != None: @@ -261,13 +270,11 @@ class RecordBuilder(object): add("(" + ", ".join(args) + ")") - - yield "".join(result) - + yield "".join(result).ljust(85) + "# %s" % entry['range'] class PastedDefParser(RecordBuilder): - + def load(self, infile): tokens = self._tokenize(infile) entries = self._parse(tokens) @@ -275,7 +282,7 @@ class PastedDefParser(RecordBuilder): entries = self._guess_field_types(entries) entries = self._convert_to_records(entries) return entries - + def _tokenize(self, data): for item in data.replace('\n',' ').split(' '): item = item.strip() @@ -299,7 +306,7 @@ class PastedDefParser(RecordBuilder): current_length = None current_desc = [] state = 'range' - + byte_pos = None # COLLECT TOKENS INTO GROUPS @@ -311,7 +318,7 @@ class PastedDefParser(RecordBuilder): if byte_pos == None or token.value == byte_pos: # UPDATE RANGE POSITION byte_pos = token.value + 1 - + # CONVERT TOKEN INTO RangeToken token = RangeToken("%d-%d" % (token.value, token.value)) @@ -323,7 +330,7 @@ class PastedDefParser(RecordBuilder): #if byte_pos and token and state == 'desc' and token.start_position != byte_pos: # print token.start_position, byte_pos # current_desc.append(token) - + if token and byte_pos and token.start_position != byte_pos: state = 'desc' @@ -339,13 +346,13 @@ class PastedDefParser(RecordBuilder): # UPDATE RANGE POSITION if token: byte_pos = token.end_position + 1 - + current_range = token current_name = [] current_length = None current_desc = [] state = 'name' - + elif state == 'name': if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken): current_length = current_name.pop() @@ -361,9 +368,9 @@ class PastedDefParser(RecordBuilder): def _compile(self, groups): for g in groups: assert g['byterange'].value == g['length'].value - + desc = u' '.join(map(lambda x:unicode(x.value), g['desc'])) - + if g['name'][-1].value.lower() == '(optional)': g['name'] = g['name'][0:-1] required = False @@ -374,14 +381,14 @@ class PastedDefParser(RecordBuilder): name = u'_'.join(map(lambda x:x.value.lower(), g['name'])) name = re.sub(r'[^\w]','', name) - + yield({ 'name': name, 'desc': desc, 'length': g['byterange'].value, 'required': required, }) - + """ def _guess_field_types(self, entries): @@ -389,26 +396,26 @@ class PastedDefParser(RecordBuilder): for entry in entries: matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES)) - + for (classtype, criteria) in self.FIELD_TYPES: if 'length' in criteria: if not lengthexp(int(entry['length']), criteria['length']): continue - + if 'regexp' in criteria: for crit_key, crit_values in criteria['regexp'].items(): for crit_re in crit_values: matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 - - + + matches = list(matches.items()) matches.sort(key=lambda x:x[1]) - + matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField yield entry - + """ """ def _convert_to_records(self, entries): @@ -425,10 +432,10 @@ class PastedDefParser(RecordBuilder): add(entry['name'].ljust(40)) add(' = ') - + if entry['guessed_type']: add(entry['guessed_type'].__name__) - + args = [] args.append("max_length=%d" % entry['length']) if entry['required'] != None: diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index 27d3019..3f6cdfd 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -33,14 +33,6 @@ class PDFRecordFinder(object): if match: results.append((i, ''.join(match.groups()))) - """ - results2 = [] - for r in results: - if len(results2)==0 or results2[-1:][0][1] != r[1]: - results2.append(r) - results = results2 - """ - merged = [] for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]): merged.append( (a[0], b[0]-1, a[1]) ) @@ -57,7 +49,6 @@ class PDFRecordFinder(object): if not row: continue - #if cc.is_next_field(row): # print len(cc.data) # yield cc @@ -102,8 +93,8 @@ class PDFRecordFinder(object): re_multiwhite = re.compile(r'\s{2,}') # IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE - #if not re_multiwhite.search(row): - # return None + if not re_multiwhite.search(row): + return None white_ranges = [0,] pos = 0 @@ -145,6 +136,7 @@ class ColumnCollector(object): self.data = None self.column_widths = None self.max_data_length = 0 + self.adjust_pad = 3 pass def add(self, data): @@ -190,11 +182,12 @@ class ColumnCollector(object): adjusted_data[col_id] = value.strip() else: for col_start, col_end in self.column_widths.items(): - if col_start <= col_id and (col_end) >= col_id: + if (col_start - self.adjust_pad) <= col_id and (col_end + self.adjust_pad) >= col_id: if col_start in adjusted_data: adjusted_data[col_start] += ' ' + value.strip() else: adjusted_data[col_start] = value.strip() + return adjusted_data.items() @@ -231,7 +224,7 @@ class ColumnCollector(object): then this is probably a continuation. """ - if self.data: + if self.data and data: keys = dict(self.column_widths).keys() keys.sort() keys += [None] @@ -244,20 +237,11 @@ class ColumnCollector(object): position = keys.index(first_key) max_length = keys[position + 1] - print 'test', len(first_value), max_length if max_length: return len(first_value) > max_length or len(data) == self.max_data_length return False - #for key, nextkey in map(lambda x:(keys[x], keys[x+1]), range(len(keys)-1)): - # print 'key', key, nextkey - - first_key, first_value = dict(data).items()[0] - if self.data: - #print self.data.keys()[0], first_key, first_value, self.column_widths - return self.data.keys()[0] == first_key # and len(first_value) > self.column_widths[first_key] - return False @property def tuple(self): diff --git a/scripts/pyaccuwage-pdfparse b/scripts/pyaccuwage-pdfparse index f830f86..7cb05cb 100755 --- a/scripts/pyaccuwage-pdfparse +++ b/scripts/pyaccuwage-pdfparse @@ -32,11 +32,23 @@ doc = PDFRecordFinder(source_file) records = doc.records() builder = RecordBuilder() +def record_begins_at(record): + return int(record[1][1].data.values()[0].split('-')[0], 10) + +def record_ends_at(record): + return record[1][-1].data + return int(record[1][-1].data.values()[0].split('-')[-1], 10) + for rec in records: - + + print record_begins_at(rec) #, 'to', record_ends_at(rec) + # FIXME record_ends_at is randomly exploding due to record data being + # a lump of text and not necessarily a field entry. I assume + # this is cleaned out by the record builder class. + sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0])) - + for field in builder.load(map(lambda x:x.tuple, rec[1][1:])): sys.stdout.write('\t' + field + '\n') #print field - +