diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index 9b7c64a..27d3019 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -31,10 +31,8 @@ class PDFRecordFinder(object): for (i, row) in enumerate(self.textrows): match = self.heading_exp.match(row) if match: - print i,match.groups() - #if not ''.join(match.groups()).lower().endswith('(continued)'): results.append((i, ''.join(match.groups()))) - + """ results2 = [] for r in results: @@ -52,22 +50,49 @@ class PDFRecordFinder(object): def find_fields(self, row_iter): cc = ColumnCollector() + for r in row_iter: row = self.extract_columns_from_row(r.decode('UTF-8')) - + if not row: continue - - if cc.is_next_field(row): - if row[1][1] == 'Vendor Indicator': - import pdb - pdb.set_trace() - yield cc - cc = ColumnCollector() - + + + #if cc.is_next_field(row): + # print len(cc.data) + # yield cc + # cc = ColumnCollector() + try: cc.add(row) - + except IsNextField, e: + yield cc + cc = ColumnCollector() + cc.add(row) + except UnknownColumn, e: + raise StopIteration + + yield cc + + def find_fields_old(self, row_iter): + cc = ColumnCollector() + + for r in row_iter: + row = self.extract_columns_from_row(r.decode('UTF-8')) + + if not row: + continue + + if cc.is_next_field(row): + #if row[1][1] == 'Vendor Indicator': + # import pdb + # pdb.set_trace() + yield cc + cc = ColumnCollector() + + try: + cc.add(row) + except UnknownColumn, e: raise StopIteration yield cc @@ -77,8 +102,8 @@ class PDFRecordFinder(object): re_multiwhite = re.compile(r'\s{2,}') # IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE - if not re_multiwhite.search(row): - return None + #if not re_multiwhite.search(row): + # return None white_ranges = [0,] pos = 0 @@ -102,13 +127,13 @@ class PDFRecordFinder(object): row_result.append( (start, row[start:end].encode('ascii','ignore')) ) - + except StopIteration: white_iter = None return row_result - - + + class UnknownColumn(Exception): pass @@ -118,9 +143,36 @@ class IsNextField(Exception): class ColumnCollector(object): def __init__(self, initial=None): self.data = None + self.column_widths = None + self.max_data_length = 0 pass def add(self, data): + if not self.data: + self.data = dict(data) + else: + data = self.adjust_columns(data) + if self.is_next_field(data): + raise IsNextField() + for col_id, value in data: + self.merge_column(col_id, value) + + self.update_column_widths(data) + + def update_column_widths(self, data): + self.last_data_length = len(data) + self.max_data_length = max(self.max_data_length, len(data)) + + if not self.column_widths: + self.column_widths = dict(map(lambda (column, value): [column, column + len(value)], data)) + else: + for col_id, value in data: + try: + self.column_widths[col_id] = max(self.column_widths[col_id], col_id + len(value.strip())) + except KeyError: + pass + + def add_old(self, data): if not self.data: self.data = dict(data) else: @@ -128,12 +180,28 @@ class ColumnCollector(object): raise IsNextField() for col_id, value in data: self.merge_column(col_id, value) - + + + def adjust_columns(self, data): + adjusted_data = {} + + for col_id, value in data: + if col_id in self.data.keys(): + adjusted_data[col_id] = value.strip() + else: + for col_start, col_end in self.column_widths.items(): + if col_start <= col_id and (col_end) >= col_id: + if col_start in adjusted_data: + adjusted_data[col_start] += ' ' + value.strip() + else: + adjusted_data[col_start] = value.strip() + return adjusted_data.items() + + def merge_column(self, col_id, value): if col_id in self.data.keys(): self.data[col_id] += ' ' + value.strip() - - else: + else: # try adding a wiggle room value? # FIXME: # Sometimes description columns contain column-like @@ -142,8 +210,14 @@ class ColumnCollector(object): # after the maximum column, and assume it's part of the # max column? + """ + for col_start, col_end in self.column_widths.items(): + if col_start <= col_id and (col_end) >= col_id: + self.data[col_start] += ' ' + value.strip() + return + """ raise UnknownColumn - + def is_next_field(self, data): """ If the first key value contains a string @@ -152,9 +226,37 @@ class ColumnCollector(object): the next field. Raise an exception and continue on with a fresh ColumnCollector. """ - first_key = dict(data).keys()[0] + + """ If the length of the value in column_id is less than the position of the next column_id, + then this is probably a continuation. + """ + if self.data: - return self.data.keys()[0] == first_key + keys = dict(self.column_widths).keys() + keys.sort() + keys += [None] + + if self.last_data_length < len(data): + return True + + first_key, first_value = dict(data).items()[0] + if self.data.keys()[0] == first_key: + + position = keys.index(first_key) + max_length = keys[position + 1] + print 'test', len(first_value), max_length + if max_length: + return len(first_value) > max_length or len(data) == self.max_data_length + + return False + + #for key, nextkey in map(lambda x:(keys[x], keys[x+1]), range(len(keys)-1)): + # print 'key', key, nextkey + + first_key, first_value = dict(data).items()[0] + if self.data: + #print self.data.keys()[0], first_key, first_value, self.column_widths + return self.data.keys()[0] == first_key # and len(first_value) > self.column_widths[first_key] return False @property