Improved record detection; state records are now found

2013-03-26 13:23:48 -05:00 · 2013-03-26 13:23:48 -05:00 · e8e57bb932
commit e8e57bb932
parent 8cf78b5336
1 changed file with 16 additions and 15 deletions
--- a/pyaccuwage/pdfextract.py
+++ b/pyaccuwage/pdfextract.py
@@ -11,7 +11,7 @@ import pdb
 class PDFRecordFinder(object):
    def __init__(self, src, heading_exp=None):
        if not heading_exp:
-            heading_exp = re.compile('\s+Record Name: (.*)')
+            heading_exp = re.compile('(\s+Record Name: (.*))|Record\ Layout')
        field_heading_exp = re.compile('^Field.*Field.*Length.*Description')
@@ -35,28 +35,29 @@ class PDFRecordFinder(object):
    def locate_heading_rows_by_field(self):
        results = []
        record_break = []
        line_is_whitespace_exp = re.compile('^(\s*)$')
        record_begin_exp = self.heading_exp #re.compile('Record\ Name')
        for (i, row) in enumerate(self.textrows):
            match = self.field_heading_exp.match(row)
            if match:
                # work backwards until we think the header is fully copied
                space_count_exp = re.compile('^(\s*)')
                position = i - 1
-                last_spaces = 10000
+                spaces = 0
                #last_spaces = 10000
                complete = False
                header = None
                while not complete:
-                    if len(self.textrows[position].strip()) == 0:
+                    line_is_whitespace = True if line_is_whitespace_exp.match(self.textrows[position]) else False
-                        spaces = 10000
+                    is_record_begin = record_begin_exp.search(self.textrows[position])
-                    else:
+                    if is_record_begin or line_is_whitespace:
-                        spaces = space_count_exp.search(self.textrows[position]).end()
+                        header = self.textrows[position-1:i]
                    if spaces > last_spaces:
                        header = self.textrows[position + 1:i]
                        complete = True
                    last_spaces = spaces
                    position -= 1
                name = ''.join(header).strip().decode('ascii','ignore')
                print (name, position)
                results.append((i, name, position))
            else:
                # See if this row forces us to break from field reading.
@@ -81,6 +82,7 @@ class PDFRecordFinder(object):
            merged.append( (a[0], end_pos-1, a[1]) )
        return merged
    """
    def locate_heading_rows(self):
        results = []
        for (i, row) in enumerate(self.textrows):
@@ -95,16 +97,15 @@ class PDFRecordFinder(object):
        return merged
    def locate_layout_block_rows(self):
-        """
+        # Search for rows that contain "Record Layout", as these are not fields
-        Search for rows that contain "Record Layout", as these are not fields
+        # we are interested in because they contain the crazy blocks of field definitions
-        we are interested in because they contain the crazy blocks of field definitions
+        # and not the nice 4-column ones that we're looking for.
        and not the nice 4-column ones that we're looking for."""
        results = []
        for (i, row) in enumerate(self.textrows):
            match = re.match("Record Layout", row)
-
+    """
    def find_fields(self, row_iter):
        cc = ColumnCollector()