trying new header location method

2012-12-04 14:54:10 -06:00 · 2012-12-04 14:54:10 -06:00 · 6e1d02db8d
commit 6e1d02db8d
parent 46755dd90d
1 changed files with 53 additions and 20 deletions
--- a/pyaccuwage/pdfextract.py
+++ b/pyaccuwage/pdfextract.py
@ -11,30 +11,51 @@ import pdb
 class PDFRecordFinder(object):
    def __init__(self, src, heading_exp=None):
        if not heading_exp:
-            heading_exp = re.compile('\s+Record Name: (.*)')
+            heading_exp = re.compile('(\s+Record Name: (.*)|\s+(.*Record Layout.*)')
+
+        field_heading_exp = re.compile('^Field.*Field.*Length.*Descrition')

        opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
        pdftext = subprocess.check_output(opts)
        self.textrows = pdftext.split('\n')
        self.heading_exp = heading_exp
-
+        self.field_heading_exp = field_heading_exp

    def records(self):
-        headings = self.locate_heading_rows()
+        #headings = self.locate_heading_rows()
+        headings = self.locate_heading_rows_by_field()
+
        for (start, end, name) in headings:
            name = name.decode('ascii', 'ignore')
            yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))))


+    def locate_heading_rows_by_field(self):
+        results = []
+        for (i, row) in enumerate(self.textrows):
+            match = self.field_heading_exp.match(row)
+            if match:
+                # work backwards until we think the header is fully copied
+                space_count_exp = re.compile('^(\s*)')
+                position = i - 1
+                last_spaces = space_count_exp.search(self.textrows[position]
+                complete = False
+                while not complete:
+                    position -= 1
+                    spaces = space_count_exp.search(self.textrows[position])
+                    if spaces > last_spaces:
+                        print 'HEADER', self.textrows[position:i]
+                        complete = True
+
+
    def locate_heading_rows(self):
        results = []
        for (i, row) in enumerate(self.textrows):
            match = self.heading_exp.match(row)
            if match:
-                print i,match.groups()
-                #if not ''.join(match.groups()).lower().endswith('(continued)'):
+                #print i,match.groups()
                results.append((i, ''.join(match.groups())))
-        
+
        """
        results2 = []
        for r in results:
@ -49,25 +70,36 @@ class PDFRecordFinder(object):

        return merged

+    def locate_layout_block_rows(self):
+        """
+        Search for rows that contain "Record Layout". These rows are not the
+        fields we are interested in: they hold the dense blocks of field
+        definitions rather than the clean 4-column tables we are looking for."""
+
+        results = []
+        for (i, row) in enumerate(self.textrows):
+            match = re.match("Record Layout", row)
+
+

    def find_fields(self, row_iter):
        cc = ColumnCollector()
        for r in row_iter:
            row = self.extract_columns_from_row(r.decode('UTF-8'))
-            
+
            if not row:
                continue
-            
+
            if cc.is_next_field(row):
-                if row[1][1] == 'Vendor Indicator':
-                    import pdb
-                    pdb.set_trace()
+                #if row[1][1] == 'Vendor Indicator':
+                #    import pdb
+                #    pdb.set_trace()
                yield cc
                cc = ColumnCollector()
-            
+
            try:
                cc.add(row)
-            
+
            except UnknownColumn, e:
                raise StopIteration
        yield cc
@ -102,13 +134,14 @@ class PDFRecordFinder(object):
                    row_result.append(
                        (start, row[start:end].encode('ascii','ignore'))
                    )
-                
+
            except StopIteration:
                white_iter = None

+        #print row_result
        return row_result
-         
-   
+
+
 class UnknownColumn(Exception):
    pass

@ -128,12 +161,12 @@ class ColumnCollector(object):
                raise IsNextField()
            for col_id, value in data:
                self.merge_column(col_id, value)
-                
+
    def merge_column(self, col_id, value):
        if col_id in self.data.keys():
            self.data[col_id] += ' ' + value.strip()
-            
-        else:        
+
+        else:
            # try adding a wiggle room value?
            # FIXME:
            # Sometimes description columns contain column-like
@ -143,7 +176,7 @@ class ColumnCollector(object):
            # max column?

            raise UnknownColumn
-    
+
    def is_next_field(self, data):
        """
        If the first key value contains a string