From 730073dcd12c9b3fdc2b3f030432fc6bc672bc71 Mon Sep 17 00:00:00 2001 From: Binh Nguyen Date: Tue, 5 Feb 2013 15:43:04 -0600 Subject: [PATCH] working better! --- pyaccuwage/pdfextract.py | 51 ++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index ef9a023..42cc9dd 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -108,45 +108,29 @@ class PDFRecordFinder(object): blank_row_counter = 0 for r in row_iter: - row = self.extract_columns_from_row(r.decode('UTF-8')) - if not row: - cc.empty_row() + row = r.decode('UTF-8') + row_columns = self.extract_columns_from_row(row) + + if not row_columns: + if cc.data and len(cc.data.keys()) > 1 and len(row.strip()) > cc.data.keys()[-1]: + yield cc + cc = ColumnCollector() + else: + cc.empty_row() continue try: - cc.add(row) + cc.add(row_columns) + except IsNextField, e: yield cc cc = ColumnCollector() - cc.add(row) + cc.add(row_columns) except UnknownColumn, e: raise StopIteration yield cc - def find_fields_old(self, row_iter): - cc = ColumnCollector() - - for r in row_iter: - row = self.extract_columns_from_row(r.decode('UTF-8')) - - if not row: - continue - - if cc.is_next_field(row): - #if row[1][1] == 'Vendor Indicator': - # import pdb - # pdb.set_trace() - yield cc - cc = ColumnCollector() - - try: - cc.add(row) - - except UnknownColumn, e: - raise StopIteration - yield cc - def extract_columns_from_row(self, row): re_multiwhite = re.compile(r'\s{2,}') @@ -246,7 +230,6 @@ class ColumnCollector(object): def adjust_columns(self, data): adjusted_data = {} - for col_id, value in data: if col_id in self.data.keys(): adjusted_data[col_id] = value.strip() @@ -315,9 +298,11 @@ class ColumnCollector(object): @property def tuple(self): - try: + #try: + if self.data: return tuple(map(lambda k:self.data[k], sorted(self.data.keys()))) - except: - import pdb - pdb.set_trace() + return () + #except: + # import pdb + # pdb.set_trace()