adding new pdf extract capability
This commit is contained in:
parent
b77b80e485
commit
e8145c5616
4 changed files with 174 additions and 134 deletions
|
@ -96,6 +96,8 @@ class RangeToken(BaseToken):
|
|||
|
||||
@property
|
||||
def value(self):
|
||||
if '-' not in self._value:
|
||||
return int(self._value)
|
||||
return reduce(lambda x,y: y-x, map(int, self._value.split('-')))+1
|
||||
|
||||
@property
|
||||
|
@ -115,8 +117,7 @@ class NumericToken(BaseToken):
|
|||
return int(self._value)
|
||||
|
||||
|
||||
|
||||
class PastedDefParser(object):
|
||||
class RecordBuilder(object):
|
||||
import fields
|
||||
|
||||
TOKEN_TYPES = [
|
||||
|
@ -163,6 +164,102 @@ class PastedDefParser(object):
|
|||
})
|
||||
]
|
||||
|
||||
def load(self, entries):
    """Turn raw spec tuples into generated field-definition source lines.

    EXPECTS ENTRIES TO BE IN THE FORM OF
    [('1-5', 'Field Name', '5', 'Description of field.'), ...]
    """
    compiled = self._compile(entries)
    typed = self._guess_field_types(compiled)
    return self._convert_to_records(typed)
|
||||
|
||||
|
||||
def _compile(self, entries):
    """Normalize raw (range, name, length, description) tuples into field dicts.

    Each yielded dict has keys: 'name' (snake_case identifier), 'desc'
    (description prefixed with the column range), 'length' (int), and
    'required' (True/False/None when unknown).

    Raises ValueError when the declared length disagrees with the range.
    """
    for (f_range, f_name, f_length, f_desc) in entries:
        f_length = int(f_length)
        # BUG FIX: a leftover debugging hook (assert + import pdb;
        # pdb.set_trace()) dropped into the debugger on bad input, and
        # asserts vanish under `python -O`. Raise a descriptive error instead.
        if f_length != RangeToken(f_range).value:
            raise ValueError(
                "Field %r: declared length %d does not match range %r"
                % (f_name, f_length, f_range))

        name_parts = f_name.split(' ')

        # A '(optional)' suffix on the name marks the field not-required;
        # 'required' anywhere in the description marks it required;
        # otherwise requiredness is unknown (None).
        if name_parts[-1].lower() == '(optional)':
            name_parts = name_parts[0:-1]
            required = False
        elif re.search('required', f_desc, re.IGNORECASE):
            required = True
        else:
            required = None

        # Build a snake_case identifier and strip any non-word characters.
        f_name = u'_'.join(part.lower() for part in name_parts)
        f_name = re.sub(r'[^\w]', '', f_name)

        yield {
            'name': f_name,
            'desc': '(' + f_range + '). ' + f_desc,
            'length': f_length,
            'required': required,
        }
|
||||
|
||||
|
||||
def _guess_field_types(self, entries):
    """Score each entry against self.FIELD_TYPES and attach 'guessed_type'.

    An entry that matches nothing falls back to self.fields.TextField.
    """
    lengthexp = LengthExpression()

    for entry in entries:
        # One score bucket per candidate field class.
        scores = {classtype: 0 for (classtype, _) in self.FIELD_TYPES}

        for (classtype, criteria) in self.FIELD_TYPES:
            # A failing length constraint disqualifies this class outright.
            if 'length' in criteria:
                if not lengthexp(int(entry['length']), criteria['length']):
                    continue

            # Each regexp hit against the named entry attribute earns a point.
            if 'regexp' in criteria:
                for crit_key, patterns in criteria['regexp'].items():
                    for pattern in patterns:
                        if pattern.search(entry[crit_key]):
                            scores[classtype] += 1

        # Stable ascending sort: the last item is the best-scoring class.
        ranked = sorted(scores.items(), key=lambda item: item[1])
        total = sum(score for _, score in ranked)

        entry['guessed_type'] = ranked[-1][0] if total > 0 else self.fields.TextField
        yield entry
|
||||
|
||||
def _convert_to_records(self, entries):
|
||||
blank_count = 1
|
||||
for entry in entries:
|
||||
result = []
|
||||
add = result.append
|
||||
|
||||
# FIELD NAME
|
||||
if entry['name'] == 'blank':
|
||||
add( (u'blank%d' % blank_count).ljust(40) )
|
||||
blank_count += 1
|
||||
else:
|
||||
add(entry['name'].ljust(40))
|
||||
|
||||
add(' = ')
|
||||
|
||||
if entry['guessed_type']:
|
||||
add(entry['guessed_type'].__name__)
|
||||
|
||||
args = []
|
||||
args.append("max_length=%d" % entry['length'])
|
||||
if entry['required'] != None:
|
||||
args.append("required=%s" % ('True' if entry['required'] else 'False'))
|
||||
|
||||
add("(" + ", ".join(args) + ")")
|
||||
|
||||
|
||||
yield "".join(result)
|
||||
|
||||
|
||||
|
||||
class PastedDefParser(RecordBuilder):
|
||||
|
||||
def load(self, infile):
|
||||
tokens = self._tokenize(infile)
|
||||
entries = self._parse(tokens)
|
||||
|
@ -278,6 +375,7 @@ class PastedDefParser(object):
|
|||
})
|
||||
|
||||
|
||||
"""
|
||||
def _guess_field_types(self, entries):
|
||||
lengthexp = LengthExpression()
|
||||
|
||||
|
@ -303,7 +401,8 @@ class PastedDefParser(object):
|
|||
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
||||
yield entry
|
||||
|
||||
|
||||
"""
|
||||
"""
|
||||
def _convert_to_records(self, entries):
|
||||
blank_count = 1
|
||||
for entry in entries:
|
||||
|
@ -332,4 +431,4 @@ class PastedDefParser(object):
|
|||
|
||||
yield "".join(result)
|
||||
|
||||
|
||||
"""
|
||||
|
|
|
@ -18,35 +18,12 @@ class PDFRecordFinder(object):
|
|||
self.textrows = pdftext.split('\n')
|
||||
self.heading_exp = heading_exp
|
||||
|
||||
"""
|
||||
def columns(self):
|
||||
results = []
|
||||
cc = ColumnCollector()
|
||||
for heading, group in self.record_grouping():
|
||||
print "HEADING", heading
|
||||
for row in group:
|
||||
if cc.is_next_field(row):
|
||||
yield cc
|
||||
cc = ColumnCollector()
|
||||
#print row
|
||||
try:
|
||||
cc.add(row)
|
||||
except UnknownColumn, e:
|
||||
results.append(cc)
|
||||
cc = ColumnCollector()
|
||||
|
||||
|
||||
|
||||
def record_grouping(self):
|
||||
row_iter = iter(self.textrows)
|
||||
i = 0
|
||||
for row in row_iter:
|
||||
i += 1
|
||||
match = self.heading_exp.match(row)
|
||||
if match:
|
||||
yield (match.groups(), self.extract_record_columns(row_iter))
|
||||
|
||||
"""
|
||||
def records(self):
    """Yield (heading_name, field_list) for every heading section located."""
    for (start, end, title) in self.locate_heading_rows():
        # Python 2 bytes -> text; non-ASCII garbage in headings is dropped.
        title = title.decode('ascii', 'ignore')
        section_rows = iter(self.textrows[start + 1:end])
        yield (title, list(self.find_fields(section_rows)))
|
||||
|
||||
|
||||
def locate_heading_rows(self):
|
||||
|
@ -56,45 +33,32 @@ class PDFRecordFinder(object):
|
|||
if match:
|
||||
if not ''.join(match.groups()).lower().endswith('(continued)'):
|
||||
results.append((i, ''.join(match.groups())))
|
||||
return results
|
||||
|
||||
merged = []
|
||||
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
|
||||
merged.append( (a[0], b[0]-1, a[1]) )
|
||||
|
||||
def records2(self):
    # Alternative record iterator; appears experimental/superseded by records().
    # NOTE(review): extract_record returns a 2-tuple, which is always truthy,
    # so the `while record` guard may never become false on its own — this
    # seems to rely on the underlying iterator raising StopIteration.
    # Confirm before reusing.
    row_iter = iter(self.textrows)
    record = True
    while record:
        record = self.extract_record(row_iter)
        yield record
|
||||
|
||||
|
||||
def extract_record(self, row_iter):
    """Consume one heading plus its following field rows from row_iter."""
    found_heading = self.find_heading(row_iter)
    found_fields = list(self.find_fields(row_iter))
    return found_heading, found_fields
|
||||
|
||||
|
||||
def find_heading(self, row_iter):
|
||||
for row in row_iter:
|
||||
heading_match = self.heading_exp.match(row)
|
||||
if heading_match:
|
||||
return heading_match.groups()
|
||||
return merged
|
||||
|
||||
|
||||
def find_fields(self, row_iter):
|
||||
cc = ColumnCollector()
|
||||
for r in row_iter:
|
||||
row = self.extract_columns_from_row(r)
|
||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
||||
|
||||
if not row:
|
||||
continue
|
||||
|
||||
if cc.is_next_field(row):
|
||||
yield cc
|
||||
cc = ColumnCollector()
|
||||
|
||||
try:
|
||||
cc.add(row)
|
||||
|
||||
except UnknownColumn, e:
|
||||
print 'UNKNOWN COLUMN', row
|
||||
raise StopIteration
|
||||
yield cc
|
||||
|
||||
|
||||
def extract_columns_from_row(self, row):
|
||||
|
@ -124,7 +88,7 @@ class PDFRecordFinder(object):
|
|||
end = white_iter.next()
|
||||
if start != end:
|
||||
row_result.append(
|
||||
(start, row[start:end])
|
||||
(start, row[start:end].encode('ascii','ignore'))
|
||||
)
|
||||
|
||||
except StopIteration:
|
||||
|
@ -133,78 +97,6 @@ class PDFRecordFinder(object):
|
|||
return row_result
|
||||
|
||||
|
||||
|
||||
def extract_row_columns(self, row_iter):
    # Pull the next table-like line from row_iter and split it on runs of
    # two-or-more whitespace characters, returning [(start_offset, text), ...].
    # NOTE(review): looks like an older variant of extract_columns_from_row;
    # verify it is still called anywhere before relying on it.
    re_multiwhite = re.compile(r'\s{2,}')
    full_width_text_count = 0

    #for r in row_iter:
    row = None
    while not row:
        # Python 2 iterator protocol (.next()); bytes are decoded to unicode.
        row = row_iter.next()
        row = row.decode('UTF-8')

        # IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE
        if not re_multiwhite.search(row):
            full_width_text_count += 1
            row = None

    if True:
        # Collect alternating start/end offsets of every multi-space gap.
        white_ranges = [0,]
        pos = 0
        match = True
        while pos < len(row):
            match = re_multiwhite.search(row[pos:])
            if match:
                white_ranges.append(pos + match.start())
                white_ranges.append(pos + match.end())
                pos += match.end()
            else:
                white_ranges.append(len(row))
                pos = len(row)

        # Pair consecutive offsets and slice out the non-empty cells.
        row_result = []
        white_iter = iter(white_ranges)
        while white_iter:
            try:
                start = white_iter.next()
                end = white_iter.next()
                if start != end:
                    row_result.append(
                        (start, row[start:end])
                    )
            except StopIteration:
                white_iter = None

        #print white_ranges
        return row_result
        #yield row_result
        #result.append(row_result)
|
||||
|
||||
|
||||
"""
|
||||
row_result = []
|
||||
pos = 0
|
||||
while pos < len(row):
|
||||
match = re_multiwhite.search(row[pos:])
|
||||
if match and match.start() > 0:
|
||||
row_result.append((
|
||||
pos,
|
||||
row[pos:pos+match.start()],))
|
||||
pos += match.end()
|
||||
else:
|
||||
if match:
|
||||
pos += match.end()
|
||||
row_result.append((pos,row[pos:]))
|
||||
pos += len(row)
|
||||
result.append(row_result)
|
||||
row_result = []
|
||||
"""
|
||||
#return result
|
||||
|
||||
|
||||
class UnknownColumn(Exception):
    """Raised when a parsed cell does not line up with any known column."""
|
||||
|
||||
|
@ -246,6 +138,8 @@ class ColumnCollector(object):
|
|||
return self.data.keys()[0] == first_key
|
||||
return False
|
||||
|
||||
|
||||
@property
|
||||
def tuple(self):
|
||||
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
||||
|
||||
|
||||
|
|
47
scripts/pyaccuwage-pdfparse
Executable file
47
scripts/pyaccuwage-pdfparse
Executable file
|
@ -0,0 +1,47 @@
|
|||
#!/usr/bin/python
"""Generate pyaccuwage model field definitions from an IRS PDF spec."""
from pyaccuwage.parser import RecordBuilder
from pyaccuwage.pdfexport import PDFRecordFinder
import argparse
import sys

parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.")
parser.add_argument("-f", "--full", help="Generate full python file, including related imports.", action="store_true")
parser.add_argument("-i", "--input", metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf")
# BUG FIX: args.classname is consulted below but this option was never defined,
# which made any invocation crash with AttributeError.
parser.add_argument("-c", "--classname", metavar="name", help="Name of the generated model class.")


args = parser.parse_args()

"""
lines = []
for x in sys.stdin.readlines():
    lines.append(x)

pdp = PastedDefParser()
tokens = pdp.load("".join(lines))
"""

def generate_imports():
    # Header emitted when --full is requested.
    return "\n".join([
        "from pyaccuwage import model",
        "from pyaccuwage.fields import *",
        "",
        "",
    ])

def generate_class_begin(name):
    # BUG FIX: 'mode.Model' was a typo — generate_imports() imports 'model'.
    return "class %s(model.Model):\n" % name

if args.full:
    sys.stdout.write(generate_imports())

if args.classname:
    classname = args.classname
else:
    classname = "GeneratedRecord"

sys.stdout.write(generate_class_begin(classname))

# NOTE(review): 'tokens' was referenced without ever being assigned (the code
# producing it is commented out above), so the script died with NameError.
# Default to empty so the class header is still emitted; wire this up to
# PDFRecordFinder / PastedDefParser output when that pipeline is finished.
tokens = []

for x in tokens:
    sys.stdout.write('\t' + x + '\n')
|
4
setup.py
4
setup.py
|
@ -1,7 +1,7 @@
|
|||
from distutils.core import setup

# Package metadata for pyaccuwage; version bumped to 0.0.7 and the new
# pyaccuwage-pdfparse script registered alongside the existing parser script.
setup(name='pyaccuwage',
      version='0.0.7',
      packages=['pyaccuwage'],
      scripts=['scripts/pyaccuwage-parse', 'scripts/pyaccuwage-pdfparse'],
      zip_safe=True,
)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue