diff --git a/pyaccuwage/parser.py b/pyaccuwage/parser.py index b06f6d4..1b475ca 100644 --- a/pyaccuwage/parser.py +++ b/pyaccuwage/parser.py @@ -39,7 +39,7 @@ class SimpleDefParser(object): else: item = self._intify(item) yield item - + class LengthExpression(object): import operator @@ -115,14 +115,16 @@ class NumericToken(BaseToken): @property def value(self): return int(self._value) - + class RecordBuilder(object): import fields + entry_max_length = 4 + TOKEN_TYPES = [ RangeToken, - NumericToken, + NumericToken, StringToken, ] @@ -134,7 +136,7 @@ class RecordBuilder(object): ], }, }), - + (fields.MoneyField, { 'regexp': { 'desc': [ @@ -144,7 +146,7 @@ class RecordBuilder(object): ], }, }), - + (fields.TextField, { 'regexp': { 'desc': [ @@ -171,13 +173,16 @@ class RecordBuilder(object): entries = self._guess_field_types(entries) entries = self._convert_to_records(entries) return entries - + def _compile(self, entries): for entry in entries: - - (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry)) - + + if len(entry) > self.entry_max_length: + continue + + (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(self.entry_max_length-len(entry)) + try: f_length = int(f_length) except ValueError, e: @@ -186,9 +191,11 @@ class RecordBuilder(object): try: assert f_length == RangeToken(f_range).value - except AssertionError: - import pdb - pdb.set_trace() + except AssertionError, e: + continue + except ValueError, e: + # bad result, skip + continue name_parts = f_name.split(' ') @@ -199,43 +206,45 @@ class RecordBuilder(object): required = True else: required = None - + f_name = u'_'.join(map(lambda x:x.lower(), name_parts)) + f_name = f_name.replace('&', 'and') f_name = re.sub(r'[^\w]','', f_name) - + yield { 'name': f_name, - 'desc': '(' + f_range + '). ' + f_desc, + 'range': f_range, + 'desc': f_desc, 'length': f_length, 'required': required, } - + def _guess_field_types(self, entries): lengthexp = LengthExpression() for entry in entries: matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES)) - + for (classtype, criteria) in self.FIELD_TYPES: if 'length' in criteria: if not lengthexp(int(entry['length']), criteria['length']): continue - + if 'regexp' in criteria: for crit_key, crit_values in criteria['regexp'].items(): for crit_re in crit_values: matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 - - + + matches = list(matches.items()) matches.sort(key=lambda x:x[1]) - + matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField yield entry - + def _convert_to_records(self, entries): blank_count = 1 for entry in entries: @@ -250,10 +259,10 @@ class RecordBuilder(object): add(entry['name'].ljust(40)) add(' = ') - + if entry['guessed_type']: add(entry['guessed_type'].__name__) - + args = [] args.append("max_length=%d" % entry['length']) if entry['required'] != None: @@ -261,13 +270,11 @@ class RecordBuilder(object): add("(" + ", ".join(args) + ")") - - yield "".join(result) - + yield "".join(result).ljust(85) + "# %s" % entry['range'] class PastedDefParser(RecordBuilder): - + def load(self, infile): tokens = self._tokenize(infile) entries = self._parse(tokens) @@ -275,7 +282,7 @@ class PastedDefParser(RecordBuilder): entries = self._guess_field_types(entries) entries = self._convert_to_records(entries) return entries - + def _tokenize(self, data): for item in data.replace('\n',' ').split(' '): item = item.strip() @@ -299,7 +306,7 @@ class PastedDefParser(RecordBuilder): current_length = None current_desc = [] state = 'range' - + byte_pos = None # COLLECT TOKENS INTO GROUPS @@ -311,7 +318,7 @@ class PastedDefParser(RecordBuilder): if byte_pos == None or token.value == byte_pos: # UPDATE RANGE POSITION byte_pos = token.value + 1 - + # CONVERT TOKEN INTO RangeToken token = RangeToken("%d-%d" % (token.value, token.value)) @@ -323,7 +330,7 @@ class PastedDefParser(RecordBuilder): #if byte_pos and token and state == 'desc' and token.start_position != byte_pos: # print token.start_position, byte_pos # current_desc.append(token) - + if token and byte_pos and token.start_position != byte_pos: state = 'desc' @@ -339,13 +346,13 @@ class PastedDefParser(RecordBuilder): # UPDATE RANGE POSITION if token: byte_pos = token.end_position + 1 - + current_range = token current_name = [] current_length = None current_desc = [] state = 'name' - + elif state == 'name': if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken): current_length = current_name.pop() @@ -361,9 +368,9 @@ class PastedDefParser(RecordBuilder): def _compile(self, groups): for g in groups: assert g['byterange'].value == g['length'].value - + desc = u' '.join(map(lambda x:unicode(x.value), g['desc'])) - + if g['name'][-1].value.lower() == '(optional)': g['name'] = g['name'][0:-1] required = False @@ -374,14 +381,14 @@ class PastedDefParser(RecordBuilder): name = u'_'.join(map(lambda x:x.value.lower(), g['name'])) name = re.sub(r'[^\w]','', name) - + yield({ 'name': name, 'desc': desc, 'length': g['byterange'].value, 'required': required, }) - + """ def _guess_field_types(self, entries): @@ -389,26 +396,26 @@ class PastedDefParser(RecordBuilder): for entry in entries: matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES)) - + for (classtype, criteria) in self.FIELD_TYPES: if 'length' in criteria: if not lengthexp(int(entry['length']), criteria['length']): continue - + if 'regexp' in criteria: for crit_key, crit_values in criteria['regexp'].items(): for crit_re in crit_values: matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 - - + + matches = list(matches.items()) matches.sort(key=lambda x:x[1]) - + matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField yield entry - + """ """ def _convert_to_records(self, entries): @@ -425,10 +432,10 @@ class PastedDefParser(RecordBuilder): add(entry['name'].ljust(40)) add(' = ') - + if entry['guessed_type']: add(entry['guessed_type'].__name__) - + args = [] args.append("max_length=%d" % entry['length']) if entry['required'] != None: diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index 8d149e0..d10e6cb 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -53,17 +53,8 @@ class PDFRecordFinder(object): for (i, row) in enumerate(self.textrows): match = self.heading_exp.match(row) if match: - #print i,match.groups() results.append((i, ''.join(match.groups()))) - """ - results2 = [] - for r in results: - if len(results2)==0 or results2[-1:][0][1] != r[1]: - results2.append(r) - results = results2 - """ - merged = [] for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]): merged.append( (a[0], b[0]-1, a[1]) ) @@ -84,6 +75,29 @@ class PDFRecordFinder(object): def find_fields(self, row_iter): cc = ColumnCollector() + blank_row_counter = 0 + + for r in row_iter: + row = self.extract_columns_from_row(r.decode('UTF-8')) + + if not row: + cc.empty_row() + continue + + try: + cc.add(row) + except IsNextField, e: + yield cc + cc = ColumnCollector() + cc.add(row) + except UnknownColumn, e: + raise StopIteration + + yield cc + + def find_fields_old(self, row_iter): + cc = ColumnCollector() + for r in row_iter: row = self.extract_columns_from_row(r.decode('UTF-8')) @@ -151,9 +165,47 @@ class IsNextField(Exception): class ColumnCollector(object): def __init__(self, initial=None): self.data = None + self.column_widths = None + self.max_data_length = 0 + self.adjust_pad = 3 + self.empty_rows = 0 pass + def __repr__(self): + return "<%s: %s>" % (self.__class__.__name__, map(lambda x:x if len(x) < 25 else x[:25] + '..', self.data.values())) + def add(self, data): + if self.empty_rows > 2: + raise IsNextField() + + if not self.data: + self.data = dict(data) + else: + data = self.adjust_columns(data) + if self.is_next_field(data): + raise IsNextField() + for col_id, value in data: + self.merge_column(col_id, value) + + self.update_column_widths(data) + + def empty_row(self): + self.empty_rows += 1 + + def update_column_widths(self, data): + self.last_data_length = len(data) + self.max_data_length = max(self.max_data_length, len(data)) + + if not self.column_widths: + self.column_widths = dict(map(lambda (column, value): [column, column + len(value)], data)) + else: + for col_id, value in data: + try: + self.column_widths[col_id] = max(self.column_widths[col_id], col_id + len(value.strip())) + except KeyError: + pass + + def add_old(self, data): if not self.data: self.data = dict(data) else: @@ -162,10 +214,28 @@ class ColumnCollector(object): for col_id, value in data: self.merge_column(col_id, value) + + def adjust_columns(self, data): + adjusted_data = {} + + + for col_id, value in data: + if col_id in self.data.keys(): + adjusted_data[col_id] = value.strip() + else: + for col_start, col_end in self.column_widths.items(): + if (col_start - self.adjust_pad) <= col_id and (col_end + self.adjust_pad) >= col_id: + if col_start in adjusted_data: + adjusted_data[col_start] += ' ' + value.strip() + else: + adjusted_data[col_start] = value.strip() + + return adjusted_data.items() + + def merge_column(self, col_id, value): if col_id in self.data.keys(): self.data[col_id] += ' ' + value.strip() - else: # try adding a wiggle room value? # FIXME: @@ -175,6 +245,12 @@ class ColumnCollector(object): # after the maximum column, and assume it's part of the # max column? + """ + for col_start, col_end in self.column_widths.items(): + if col_start <= col_id and (col_end) >= col_id: + self.data[col_start] += ' ' + value.strip() + return + """ raise UnknownColumn def is_next_field(self, data): @@ -185,13 +261,35 @@ class ColumnCollector(object): the next field. Raise an exception and continue on with a fresh ColumnCollector. """ - first_key = dict(data).keys()[0] - if self.data: - return self.data.keys()[0] == first_key + + """ If the length of the value in column_id is less than the position of the next column_id, + then this is probably a continuation. + """ + + if self.data and data: + keys = dict(self.column_widths).keys() + keys.sort() + keys += [None] + + if self.last_data_length < len(data): + return True + + first_key, first_value = dict(data).items()[0] + if self.data.keys()[0] == first_key: + + position = keys.index(first_key) + max_length = keys[position + 1] + if max_length: + return len(first_value) > max_length or len(data) == self.max_data_length + return False + @property def tuple(self): - return tuple(map(lambda k:self.data[k], sorted(self.data.keys()))) - + try: + return tuple(map(lambda k:self.data[k], sorted(self.data.keys()))) + except: + import pdb + pdb.set_trace() diff --git a/scripts/pyaccuwage-pdfparse b/scripts/pyaccuwage-pdfparse index f830f86..897ef53 100755 --- a/scripts/pyaccuwage-pdfparse +++ b/scripts/pyaccuwage-pdfparse @@ -32,11 +32,46 @@ doc = PDFRecordFinder(source_file) records = doc.records() builder = RecordBuilder() +def record_begins_at(field): + return int(fields[0].data.values()[0].split('-')[0], 10) + +def record_ends_at(fields): + return int(fields[-1].data.values()[0].split('-')[-1], 10) + +last_record_begins_at = -1 +last_record_ends_at = -1 + for rec in records: - - sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0])) - + #if not rec[1]: + # continue # no actual fields detected + fields = rec[1] + + # strip out fields that are not 4 items long + fields = filter(lambda x:len(x.tuple) == 4, fields) + + # strip fields that don't begin at position 0 + fields = filter(lambda x: 0 in x.data, fields) + + # strip fields that don't have a length-range type item in position 0 + fields = filter(lambda x: re.match('^\d+[-]?\d*$', x.data[0]), fields) + + if not fields: + continue + + begins_at = record_begins_at(fields) + ends_at = record_ends_at(fields) + + # FIXME record_ends_at is randomly exploding due to record data being + # a lump of text and not necessarily a field entry. I assume + # this is cleaned out by the record builder class. + + print last_record_ends_at + 1, begins_at + #if last_record_ends_at + 1 != begins_at: + sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0])) + for field in builder.load(map(lambda x:x.tuple, rec[1][1:])): sys.stdout.write('\t' + field + '\n') #print field - + + last_record_ends_at = ends_at +