From 6e4a975cfb734db5a8cff0494bee04104d2ba879 Mon Sep 17 00:00:00 2001
From: Binh Nguyen <binh37@gmail.com>
Date: Tue, 4 Dec 2012 16:04:08 -0600
Subject: [PATCH] Changed the way records are found: we now search for field
 headers and then work backwards to determine the record name. We also
 added the ability to "break" out of reading a series of field definitions
 at certain break points such as "Record Layout". There is currently an error
 at p1220 line 2704, which is caused by the column data starting in the 4th
 column, "Description and Remarks".

If ColumnCollectors were seeded with the field titles, and took their column
positions from those titles, it might be possible to at least read the record
fields that follow without auto-adjusting them.
---
 pyaccuwage/pdfextract.py | 53 +++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py
index d10e6cb..12c99e6 100644
--- a/pyaccuwage/pdfextract.py
+++ b/pyaccuwage/pdfextract.py
@@ -11,9 +11,9 @@ import pdb
 class PDFRecordFinder(object):
     def __init__(self, src, heading_exp=None):
         if not heading_exp:
-            heading_exp = re.compile('(\s+Record Name: (.*)|\s+(.*Record Layout.*)')
+            heading_exp = re.compile('\s+Record Name: (.*)')
 
-        field_heading_exp = re.compile('^Field.*Field.*Length.*Descrition')
+        field_heading_exp = re.compile('^Field.*Field.*Length.*Description')
 
         opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
         pdftext = subprocess.check_output(opts)
@@ -27,26 +27,57 @@ class PDFRecordFinder(object):
 
         for (start, end, name) in headings:
             name = name.decode('ascii', 'ignore')
-            yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))))
+            yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))), (start+1, end))
 
 
     def locate_heading_rows_by_field(self):
         results = []
+        record_break = []
         for (i, row) in enumerate(self.textrows):
             match = self.field_heading_exp.match(row)
             if match:
                 # work backwards until we think the header is fully copied
                 space_count_exp = re.compile('^(\s*)')
                 position = i - 1
-                last_spaces = space_count_exp.search(self.textrows[position]
+                last_spaces = 10000
                 complete = False
+                header = None
                 while not complete:
-                    position -= 1
-                    spaces = space_count_exp.search(self.textrows[position])
-                    if spaces > last_spaces:
-                        print 'HEADER', self.textrows[position:i]
-                        complete = True
+                    if len(self.textrows[position].strip()) == 0:
+                        spaces = 10000
+                    else:
+                        spaces = space_count_exp.search(self.textrows[position]).end()
 
+                    if spaces > last_spaces:
+                        header = self.textrows[position + 1:i]
+                        complete = True
+                    last_spaces = spaces
+                    position -= 1
+
+                name = ''.join(header).strip().decode('ascii','ignore')
+                results.append((i, name))
+            else:
+                # See if this row forces us to break from field reading.
+                if re.search('Record\ Layout', row):
+                    record_break.append(i)
+
+        merged = []
+        for (a, b) in zip(results, results[1:] + [(len(self.textrows), None)]):
+            end_pos = None
+
+            print a[0], record_break[0], b[0]-1
+
+            while record_break and record_break[0] < a[0]:
+                record_break = record_break[1:]
+
+            if record_break[0] < b[0]-1:
+                end_pos = record_break[0]
+                record_break = record_break[1:]
+            else:
+                end_pos = b[0]-1
+
+            merged.append( (a[0], end_pos, a[1]) )
+        return merged
 
     def locate_heading_rows(self):
         results = []
@@ -175,8 +206,8 @@ class ColumnCollector(object):
         return "<%s: %s>" % (self.__class__.__name__, map(lambda x:x if len(x) < 25 else x[:25] + '..', self.data.values()))
 
     def add(self, data):
-        if self.empty_rows > 2:
-            raise IsNextField()
+        #if self.empty_rows > 2:
+        #    raise IsNextField()
 
         if not self.data:
             self.data = dict(data)