Parsing all the way through the pdf appears to work. Next we need

to track the beginning/ending points for each record and append continuation records onto the previous record. There's some issue in the pyaccuwage-pdfparse script causing it to have problems reading the last record field in a record group. Maybe the record extractor needs to discard the last failed ColumnCollector, rather than return it, if it's determined to hold junk data? The record builder seems to handle everything just fine. Added a step to the field name parsing that replaces ampersands with the string "and" so they don't cause problems in variable names.
2012-11-13 15:53:41 -06:00 · 2012-11-13 15:53:41 -06:00 · 1c7533973a
commit 1c7533973a
parent fe4bd20bad
3 changed files with 74 additions and 71 deletions
--- a/pyaccuwage/parser.py
+++ b/pyaccuwage/parser.py
@ -120,6 +120,8 @@ class NumericToken(BaseToken):
 class RecordBuilder(object):
    import fields

+    entry_max_length = 4
+
    TOKEN_TYPES = [
        RangeToken,
        NumericToken,
@ -176,7 +178,10 @@ class RecordBuilder(object):
    def _compile(self, entries):
        for entry in entries:

-            (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry))
+            if len(entry) > self.entry_max_length:
+                continue
+
+            (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(self.entry_max_length-len(entry))

            try:
                f_length = int(f_length)
@ -186,9 +191,11 @@ class RecordBuilder(object):

            try:
                assert f_length == RangeToken(f_range).value
-            except AssertionError:
-                import pdb
-                pdb.set_trace()
+            except AssertionError, e:
+                continue
+            except ValueError, e:
+                # bad result, skip
+                continue

            name_parts = f_name.split(' ')

@ -201,11 +208,13 @@ class RecordBuilder(object):
                required = None

            f_name = u'_'.join(map(lambda x:x.lower(), name_parts))
+            f_name = f_name.replace('&', 'and')
            f_name = re.sub(r'[^\w]','', f_name)

            yield {
                'name': f_name,
-                'desc': '(' + f_range + '). ' + f_desc,
+                'range': f_range,
+                'desc': f_desc,
                'length': f_length,
                'required': required,
            }
@ -261,9 +270,7 @@ class RecordBuilder(object):

                add("(" + ", ".join(args) + ")")

-
-            yield "".join(result)
-
+            yield "".join(result).ljust(85) + "# %s" % entry['range']


 class PastedDefParser(RecordBuilder):
--- a/pyaccuwage/pdfextract.py
+++ b/pyaccuwage/pdfextract.py
@ -33,14 +33,6 @@ class PDFRecordFinder(object):
            if match:
                results.append((i, ''.join(match.groups())))

-        """
-        results2 = []
-        for r in results:
-            if len(results2)==0 or results2[-1:][0][1] != r[1]:
-                results2.append(r)
-        results = results2
-        """
-
        merged = []
        for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
            merged.append( (a[0], b[0]-1, a[1]) )
@ -57,7 +49,6 @@ class PDFRecordFinder(object):
            if not row:
                continue

-
            #if cc.is_next_field(row):
            #    print len(cc.data)
            #    yield cc
@ -102,8 +93,8 @@ class PDFRecordFinder(object):
        re_multiwhite = re.compile(r'\s{2,}')

        # IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE
-        #if not re_multiwhite.search(row):
-        #    return None
+        if not re_multiwhite.search(row):
+            return None

        white_ranges = [0,]
        pos = 0
@ -145,6 +136,7 @@ class ColumnCollector(object):
        self.data = None
        self.column_widths = None
        self.max_data_length = 0
+        self.adjust_pad = 3
        pass

    def add(self, data):
@ -190,11 +182,12 @@ class ColumnCollector(object):
                adjusted_data[col_id] = value.strip()
            else:
                for col_start, col_end in self.column_widths.items():
-                    if col_start <= col_id and (col_end) >= col_id:
+                    if (col_start - self.adjust_pad) <= col_id and (col_end + self.adjust_pad) >= col_id:
                        if col_start in adjusted_data:
                            adjusted_data[col_start] += ' ' + value.strip()
                        else:
                            adjusted_data[col_start] = value.strip()
+
        return adjusted_data.items()


@ -231,7 +224,7 @@ class ColumnCollector(object):
            then this is probably a continuation.
        """

-        if self.data:
+        if self.data and data:
            keys = dict(self.column_widths).keys()
            keys.sort()
            keys += [None]
@ -244,20 +237,11 @@ class ColumnCollector(object):

                position = keys.index(first_key)
                max_length = keys[position + 1]
-                print 'test', len(first_value), max_length
                if max_length:
                    return len(first_value) > max_length or len(data) == self.max_data_length

        return False

-            #for key, nextkey in map(lambda x:(keys[x], keys[x+1]), range(len(keys)-1)):
-            #    print 'key', key, nextkey
-
-        first_key, first_value = dict(data).items()[0]
-        if self.data:
-            #print self.data.keys()[0], first_key, first_value, self.column_widths
-            return self.data.keys()[0] == first_key # and len(first_value) > self.column_widths[first_key]
-        return False

    @property
    def tuple(self):
--- a/scripts/pyaccuwage-pdfparse
+++ b/scripts/pyaccuwage-pdfparse
@ -32,8 +32,20 @@ doc = PDFRecordFinder(source_file)
 records = doc.records()
 builder = RecordBuilder()

+def record_begins_at(record):
+    return int(record[1][1].data.values()[0].split('-')[0], 10)
+
+def record_ends_at(record):
+    return record[1][-1].data
+    return int(record[1][-1].data.values()[0].split('-')[-1], 10)
+
 for rec in records:

+    print record_begins_at(rec) #, 'to', record_ends_at(rec)
+    # FIXME record_ends_at is randomly exploding due to record data being
+    # a lump of text and not necessarily a field entry. I assume
+    # this is cleaned out by the record builder class.
+
    sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))

    for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):