From fe4bd20badd14fc44b37d1463311b6595ea86b92 Mon Sep 17 00:00:00 2001
From: Binh Nguyen <binh37@gmail.com>
Date: Tue, 6 Nov 2012 15:34:35 -0600
Subject: [PATCH] Record detection seems to be working much better. We
 currently have an issue where full-page-width blocks are interpreted as a
 single large column, and subsequent field-definition columns are then
 merged into it as subcolumns.

The current problematic line in p1220 is 1598.

Maybe add some functionality that lets us specify the number of
columns we're most interested in? Or maybe automatically discard
1-column ColumnCollectors?
---
 pyaccuwage/pdfextract.py | 150 ++++++++++++++++++++++++++++++++-------
 1 file changed, 126 insertions(+), 24 deletions(-)

diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py
index 9b7c64a..27d3019 100644
--- a/pyaccuwage/pdfextract.py
+++ b/pyaccuwage/pdfextract.py
@@ -31,10 +31,8 @@ class PDFRecordFinder(object):
         for (i, row) in enumerate(self.textrows):
             match = self.heading_exp.match(row)
             if match:
-                print i,match.groups()
-                #if not ''.join(match.groups()).lower().endswith('(continued)'):
                 results.append((i, ''.join(match.groups())))
-        
+
         """
         results2 = []
         for r in results:
@@ -52,22 +50,49 @@ class PDFRecordFinder(object):
 
     def find_fields(self, row_iter):
         cc = ColumnCollector()
+
         for r in row_iter:
             row = self.extract_columns_from_row(r.decode('UTF-8'))
-            
+
             if not row:
                 continue
-            
-            if cc.is_next_field(row):
-                if row[1][1] == 'Vendor Indicator':
-                    import pdb
-                    pdb.set_trace()
-                yield cc
-                cc = ColumnCollector()
-            
+
+
+            #if cc.is_next_field(row):
+            #    print len(cc.data)
+            #    yield cc
+            #    cc = ColumnCollector()
+
             try:
                 cc.add(row)
-            
+            except IsNextField, e:
+                yield cc
+                cc = ColumnCollector()
+                cc.add(row)
+            except UnknownColumn, e:
+                raise StopIteration
+
+        yield cc
+
+    def find_fields_old(self, row_iter):
+        cc = ColumnCollector()
+
+        for r in row_iter:
+            row = self.extract_columns_from_row(r.decode('UTF-8'))
+
+            if not row:
+                continue
+
+            if cc.is_next_field(row):
+                #if row[1][1] == 'Vendor Indicator':
+                #    import pdb
+                #    pdb.set_trace()
+                yield cc
+                cc = ColumnCollector()
+
+            try:
+                cc.add(row)
+
             except UnknownColumn, e:
                 raise StopIteration
         yield cc
@@ -77,8 +102,8 @@ class PDFRecordFinder(object):
         re_multiwhite = re.compile(r'\s{2,}')
 
         # IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE
-        if not re_multiwhite.search(row):
-            return None
+        #if not re_multiwhite.search(row):
+        #    return None
 
         white_ranges = [0,]
         pos = 0
@@ -102,13 +127,13 @@ class PDFRecordFinder(object):
                     row_result.append(
                         (start, row[start:end].encode('ascii','ignore'))
                     )
-                
+
             except StopIteration:
                 white_iter = None
 
         return row_result
-         
-   
+
+
 class UnknownColumn(Exception):
     pass
 
@@ -118,9 +143,36 @@ class IsNextField(Exception):
 class ColumnCollector(object):
     def __init__(self, initial=None):
         self.data = None
+        self.column_widths = None
+        self.max_data_length = 0
         pass
 
     def add(self, data):
+        if not self.data:
+            self.data = dict(data)
+        else:
+            data = self.adjust_columns(data)
+            if self.is_next_field(data):
+                raise IsNextField()
+            for col_id, value in data:
+                self.merge_column(col_id, value)
+
+        self.update_column_widths(data)
+
+    def update_column_widths(self, data):
+        self.last_data_length = len(data)
+        self.max_data_length = max(self.max_data_length, len(data))
+
+        if not self.column_widths:
+            self.column_widths = dict(map(lambda (column, value): [column, column + len(value)], data))
+        else:
+            for col_id, value in data:
+                try:
+                    self.column_widths[col_id] = max(self.column_widths[col_id], col_id + len(value.strip()))
+                except KeyError:
+                    pass
+
+    def add_old(self, data):
         if not self.data:
             self.data = dict(data)
         else:
@@ -128,12 +180,28 @@ class ColumnCollector(object):
                 raise IsNextField()
             for col_id, value in data:
                 self.merge_column(col_id, value)
-                
+
+
+    def adjust_columns(self, data):
+        adjusted_data = {}
+
+        for col_id, value in data:
+            if col_id in self.data.keys():
+                adjusted_data[col_id] = value.strip()
+            else:
+                for col_start, col_end in self.column_widths.items():
+                    if col_start <= col_id and (col_end) >= col_id:
+                        if col_start in adjusted_data:
+                            adjusted_data[col_start] += ' ' + value.strip()
+                        else:
+                            adjusted_data[col_start] = value.strip()
+        return adjusted_data.items()
+
+
     def merge_column(self, col_id, value):
         if col_id in self.data.keys():
             self.data[col_id] += ' ' + value.strip()
-            
-        else:        
+        else:
             # try adding a wiggle room value?
             # FIXME:
             # Sometimes description columns contain column-like
@@ -142,8 +210,14 @@ class ColumnCollector(object):
             # after the maximum column, and assume it's part of the
             # max column?
 
+            """
+            for col_start, col_end in self.column_widths.items():
+                if col_start <= col_id and (col_end) >= col_id:
+                    self.data[col_start] += ' ' + value.strip()
+                    return
+            """
             raise UnknownColumn
-    
+
     def is_next_field(self, data):
         """
         If the first key value contains a string
@@ -152,9 +226,37 @@ class ColumnCollector(object):
         the next field. Raise an exception and continue
         on with a fresh ColumnCollector.
         """
-        first_key = dict(data).keys()[0]
+
+        """ If the length of the value in column_id is less than the position of the next column_id,
+            then this is probably a continuation.
+        """
+
         if self.data:
-            return self.data.keys()[0] == first_key
+            keys = dict(self.column_widths).keys()
+            keys.sort()
+            keys += [None]
+
+            if self.last_data_length < len(data):
+                return True
+
+            first_key, first_value = dict(data).items()[0]
+            if self.data.keys()[0] == first_key:
+
+                position = keys.index(first_key)
+                max_length = keys[position + 1]
+                print 'test', len(first_value), max_length
+                if max_length:
+                    return len(first_value) > max_length or len(data) == self.max_data_length
+
+        return False
+
+            #for key, nextkey in map(lambda x:(keys[x], keys[x+1]), range(len(keys)-1)):
+            #    print 'key', key, nextkey
+
+        first_key, first_value = dict(data).items()[0]
+        if self.data:
+            #print self.data.keys()[0], first_key, first_value, self.column_widths
+            return self.data.keys()[0] == first_key # and len(first_value) > self.column_widths[first_key]
         return False
 
     @property