trying new header location method

2012-12-04 14:54:10 -06:00 · 2012-12-04 14:54:10 -06:00 · 6e1d02db8d
commit 6e1d02db8d
parent 46755dd90d
1 changed file with 53 additions and 20 deletions
--- a/pyaccuwage/pdfextract.py
+++ b/pyaccuwage/pdfextract.py
@@ -11,28 +11,49 @@ import pdb
 class PDFRecordFinder(object):
    def __init__(self, src, heading_exp=None):
        if not heading_exp:
-            heading_exp = re.compile('\s+Record Name: (.*)')
+            heading_exp = re.compile('(\s+Record Name: (.*)|\s+(.*Record Layout.*)')
+
+        field_heading_exp = re.compile('^Field.*Field.*Length.*Descrition')

        opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
        pdftext = subprocess.check_output(opts)
        self.textrows = pdftext.split('\n')
        self.heading_exp = heading_exp
-
+        self.field_heading_exp = field_heading_exp

    def records(self):
-        headings = self.locate_heading_rows()
+        #headings = self.locate_heading_rows()
+        headings = self.locate_heading_rows_by_field()
+
        for (start, end, name) in headings:
            name = name.decode('ascii', 'ignore')
            yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))))


+    def locate_heading_rows_by_field(self):
+        results = []
+        for (i, row) in enumerate(self.textrows):
+            match = self.field_heading_exp.match(row)
+            if match:
+                # work backwards until we think the header is fully copied
+                space_count_exp = re.compile('^(\s*)')
+                position = i - 1
+                last_spaces = space_count_exp.search(self.textrows[position]
+                complete = False
+                while not complete:
+                    position -= 1
+                    spaces = space_count_exp.search(self.textrows[position])
+                    if spaces > last_spaces:
+                        print 'HEADER', self.textrows[position:i]
+                        complete = True
+
+
    def locate_heading_rows(self):
        results = []
        for (i, row) in enumerate(self.textrows):
            match = self.heading_exp.match(row)
            if match:
-                print i,match.groups()
-                #if not ''.join(match.groups()).lower().endswith('(continued)'):
+                #print i,match.groups()
                results.append((i, ''.join(match.groups())))

        """
@@ -49,6 +70,17 @@ class PDFRecordFinder(object):

        return merged

+    def locate_layout_block_rows(self):
+        """
+        Search for rows that contain "Record Layout", as these are not fields
+        we are interested in because they contain the crazy blocks of field definitions
+        and not the nice 4-column ones that we're looking for."""
+
+        results = []
+        for (i, row) in enumerate(self.textrows):
+            match = re.match("Record Layout", row)
+
+

    def find_fields(self, row_iter):
        cc = ColumnCollector()
@@ -59,9 +91,9 @@ class PDFRecordFinder(object):
                continue

            if cc.is_next_field(row):
-                if row[1][1] == 'Vendor Indicator':
-                    import pdb
-                    pdb.set_trace()
+                #if row[1][1] == 'Vendor Indicator':
+                #    import pdb
+                #    pdb.set_trace()
                yield cc
                cc = ColumnCollector()

@@ -106,6 +138,7 @@ class PDFRecordFinder(object):
            except StopIteration:
                white_iter = None

+        #print row_result
        return row_result