diff --git a/pyaccuwage/parser.py b/pyaccuwage/parser.py index b06f6d4..1b475ca 100644 --- a/pyaccuwage/parser.py +++ b/pyaccuwage/parser.py @@ -39,7 +39,7 @@ class SimpleDefParser(object): else: item = self._intify(item) yield item - + class LengthExpression(object): import operator @@ -115,14 +115,16 @@ class NumericToken(BaseToken): @property def value(self): return int(self._value) - + class RecordBuilder(object): import fields + entry_max_length = 4 + TOKEN_TYPES = [ RangeToken, - NumericToken, + NumericToken, StringToken, ] @@ -134,7 +136,7 @@ class RecordBuilder(object): ], }, }), - + (fields.MoneyField, { 'regexp': { 'desc': [ @@ -144,7 +146,7 @@ class RecordBuilder(object): ], }, }), - + (fields.TextField, { 'regexp': { 'desc': [ @@ -171,13 +173,16 @@ class RecordBuilder(object): entries = self._guess_field_types(entries) entries = self._convert_to_records(entries) return entries - + def _compile(self, entries): for entry in entries: - - (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry)) - + + if len(entry) > self.entry_max_length: + continue + + (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(self.entry_max_length-len(entry)) + try: f_length = int(f_length) except ValueError, e: @@ -186,9 +191,11 @@ class RecordBuilder(object): try: assert f_length == RangeToken(f_range).value - except AssertionError: - import pdb - pdb.set_trace() + except AssertionError, e: + continue + except ValueError, e: + # bad result, skip + continue name_parts = f_name.split(' ') @@ -199,43 +206,45 @@ class RecordBuilder(object): required = True else: required = None - + f_name = u'_'.join(map(lambda x:x.lower(), name_parts)) + f_name = f_name.replace('&', 'and') f_name = re.sub(r'[^\w]','', f_name) - + yield { 'name': f_name, - 'desc': '(' + f_range + '). ' + f_desc, + 'range': f_range, + 'desc': f_desc, 'length': f_length, 'required': required, } - + def _guess_field_types(self, entries): lengthexp = LengthExpression() for entry in entries: matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES)) - + for (classtype, criteria) in self.FIELD_TYPES: if 'length' in criteria: if not lengthexp(int(entry['length']), criteria['length']): continue - + if 'regexp' in criteria: for crit_key, crit_values in criteria['regexp'].items(): for crit_re in crit_values: matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 - - + + matches = list(matches.items()) matches.sort(key=lambda x:x[1]) - + matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField yield entry - + def _convert_to_records(self, entries): blank_count = 1 for entry in entries: @@ -250,10 +259,10 @@ class RecordBuilder(object): add(entry['name'].ljust(40)) add(' = ') - + if entry['guessed_type']: add(entry['guessed_type'].__name__) - + args = [] args.append("max_length=%d" % entry['length']) if entry['required'] != None: @@ -261,13 +270,11 @@ class RecordBuilder(object): add("(" + ", ".join(args) + ")") - - yield "".join(result) - + yield "".join(result).ljust(85) + "# %s" % entry['range'] class PastedDefParser(RecordBuilder): - + def load(self, infile): tokens = self._tokenize(infile) entries = self._parse(tokens) @@ -275,7 +282,7 @@ class PastedDefParser(RecordBuilder): entries = self._guess_field_types(entries) entries = self._convert_to_records(entries) return entries - + def _tokenize(self, data): for item in data.replace('\n',' ').split(' '): item = item.strip() @@ -299,7 +306,7 @@ class PastedDefParser(RecordBuilder): current_length = None current_desc = [] state = 'range' - + byte_pos = None # COLLECT TOKENS INTO GROUPS @@ -311,7 +318,7 @@ class PastedDefParser(RecordBuilder): if byte_pos == None or token.value == byte_pos: # UPDATE RANGE POSITION byte_pos = token.value + 1 - + # CONVERT TOKEN INTO RangeToken token = RangeToken("%d-%d" % (token.value, token.value)) @@ -323,7 +330,7 @@ class PastedDefParser(RecordBuilder): #if byte_pos and token and state == 'desc' and token.start_position != byte_pos: # print token.start_position, byte_pos # current_desc.append(token) - + if token and byte_pos and token.start_position != byte_pos: state = 'desc' @@ -339,13 +346,13 @@ class PastedDefParser(RecordBuilder): # UPDATE RANGE POSITION if token: byte_pos = token.end_position + 1 - + current_range = token current_name = [] current_length = None current_desc = [] state = 'name' - + elif state == 'name': if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken): current_length = current_name.pop() @@ -361,9 +368,9 @@ class PastedDefParser(RecordBuilder): def _compile(self, groups): for g in groups: assert g['byterange'].value == g['length'].value - + desc = u' '.join(map(lambda x:unicode(x.value), g['desc'])) - + if g['name'][-1].value.lower() == '(optional)': g['name'] = g['name'][0:-1] required = False @@ -374,14 +381,14 @@ class PastedDefParser(RecordBuilder): name = u'_'.join(map(lambda x:x.value.lower(), g['name'])) name = re.sub(r'[^\w]','', name) - + yield({ 'name': name, 'desc': desc, 'length': g['byterange'].value, 'required': required, }) - + """ def _guess_field_types(self, entries): @@ -389,26 +396,26 @@ class PastedDefParser(RecordBuilder): for entry in entries: matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES)) - + for (classtype, criteria) in self.FIELD_TYPES: if 'length' in criteria: if not lengthexp(int(entry['length']), criteria['length']): continue - + if 'regexp' in criteria: for crit_key, crit_values in criteria['regexp'].items(): for crit_re in crit_values: matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 - - + + matches = list(matches.items()) matches.sort(key=lambda x:x[1]) - + matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField yield entry - + """ """ def _convert_to_records(self, entries): @@ -425,10 +432,10 @@ class PastedDefParser(RecordBuilder): add(entry['name'].ljust(40)) add(' = ') - + if entry['guessed_type']: add(entry['guessed_type'].__name__) - + args = [] args.append("max_length=%d" % entry['length']) if entry['required'] != None: diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index 27d3019..3f6cdfd 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -33,14 +33,6 @@ class PDFRecordFinder(object): if match: results.append((i, ''.join(match.groups()))) - """ - results2 = [] - for r in results: - if len(results2)==0 or results2[-1:][0][1] != r[1]: - results2.append(r) - results = results2 - """ - merged = [] for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]): merged.append( (a[0], b[0]-1, a[1]) ) @@ -57,7 +49,6 @@ class PDFRecordFinder(object): if not row: continue - #if cc.is_next_field(row): # print len(cc.data) # yield cc @@ -102,8 +93,8 @@ class PDFRecordFinder(object): re_multiwhite = re.compile(r'\s{2,}') # IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE - #if not re_multiwhite.search(row): - # return None + if not re_multiwhite.search(row): + return None white_ranges = [0,] pos = 0 @@ -145,6 +136,7 @@ class ColumnCollector(object): self.data = None self.column_widths = None self.max_data_length = 0 + self.adjust_pad = 3 pass def add(self, data): @@ -190,11 +182,12 @@ class ColumnCollector(object): adjusted_data[col_id] = value.strip() else: for col_start, col_end in self.column_widths.items(): - if col_start <= col_id and (col_end) >= col_id: + if (col_start - self.adjust_pad) <= col_id and (col_end + self.adjust_pad) >= col_id: if col_start in adjusted_data: adjusted_data[col_start] += ' ' + value.strip() else: adjusted_data[col_start] = value.strip() + return adjusted_data.items() @@ -231,7 +224,7 @@ class ColumnCollector(object): then this is probably a continuation. """ - if self.data: + if self.data and data: keys = dict(self.column_widths).keys() keys.sort() keys += [None] @@ -244,20 +237,11 @@ class ColumnCollector(object): position = keys.index(first_key) max_length = keys[position + 1] - print 'test', len(first_value), max_length if max_length: return len(first_value) > max_length or len(data) == self.max_data_length return False - #for key, nextkey in map(lambda x:(keys[x], keys[x+1]), range(len(keys)-1)): - # print 'key', key, nextkey - - first_key, first_value = dict(data).items()[0] - if self.data: - #print self.data.keys()[0], first_key, first_value, self.column_widths - return self.data.keys()[0] == first_key # and len(first_value) > self.column_widths[first_key] - return False @property def tuple(self): diff --git a/scripts/pyaccuwage-pdfparse b/scripts/pyaccuwage-pdfparse index f830f86..7cb05cb 100755 --- a/scripts/pyaccuwage-pdfparse +++ b/scripts/pyaccuwage-pdfparse @@ -32,11 +32,23 @@ doc = PDFRecordFinder(source_file) records = doc.records() builder = RecordBuilder() +def record_begins_at(record): + return int(record[1][1].data.values()[0].split('-')[0], 10) + +def record_ends_at(record): + return record[1][-1].data + return int(record[1][-1].data.values()[0].split('-')[-1], 10) + for rec in records: - + + print record_begins_at(rec) #, 'to', record_ends_at(rec) + # FIXME record_ends_at is randomly exploding due to record data being + # a lump of text and not necessarily a field entry. I assume + # this is cleaned out by the record builder class. + sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0])) - + for field in builder.load(map(lambda x:x.tuple, rec[1][1:])): sys.stdout.write('\t' + field + '\n') #print field - +