Adding a simple parser for reading record field definitions from PDFs

2012-04-05 15:19:00 -05:00 · 2012-04-05 15:19:00 -05:00 · 6e9b8041b9
commit 6e9b8041b9
parent 97a74c09f9
1 changed files with 129 additions and 0 deletions
--- a/pyaccuwage/parser.py
+++ b/pyaccuwage/parser.py
@@ -0,0 +1,129 @@
+#!/usr/bin/python
+# coding=UTF-8
+import re
+
+
+
+class SimpleDefParser(object):
+    """Parse comma-separated field definition rows into tuples of tokens.
+
+    Each row is split on commas and every item is normalized:
+
+    * surrounding whitespace is stripped;
+    * an item containing spaces has them replaced by underscores (its case
+      is left untouched), while an item without spaces is upper-cased;
+    * an item made only of integers joined by hyphens (e.g. ``2-5``) is
+      folded into one integer by subtracting right-minus-left
+      (``2-5`` -> ``3``);
+    * any other item becomes an ``int`` when it parses as one and is kept
+      as a string otherwise.
+    """
+
+    def __init__(self):
+        pass
+
+    def load(self, infile):
+        """Lazily yield one tuple of tokens per row.
+
+        ``infile`` is any iterable of strings (an open file, a list, ...).
+        """
+        for row in infile:
+            yield tuple(self._tokenize(row))
+
+    def _intify(self, x):
+        """Return ``x`` as an ``int`` if it parses as one, else unchanged."""
+        try:
+            return int(x.strip())
+        except ValueError:
+            return x
+
+    def _tokenize(self, row):
+        """Yield the normalized items of one comma-separated ``row``."""
+        for item in row.split(','):
+            item = item.strip()
+            # NOTE(review): multi-word items get underscores but are NOT
+            # upper-cased, unlike single-word items -- confirm this is
+            # intended; behavior is kept as-is here.
+            if ' ' in item:
+                item = item.replace(' ', '_')
+            else:
+                item = item.upper()
+
+            if '-' in item:
+                try:
+                    bounds = [int(part) for part in item.split('-')]
+                except ValueError:
+                    # Hyphenated text ("foo-bar") or a negative number
+                    # ("-5") is not a numeric range; handle it below like
+                    # any ordinary item instead of subtracting strings.
+                    bounds = None
+                if bounds is not None:
+                    # Same left fold as reduce(lambda x, y: y - x, bounds),
+                    # written out so it does not depend on the builtin
+                    # ``reduce`` that only exists on Python 2.
+                    # NOTE(review): "2-5" yields 3, not the inclusive
+                    # width 4 -- confirm which value callers expect.
+                    folded = bounds[0]
+                    for bound in bounds[1:]:
+                        folded = bound - folded
+                    yield folded
+                    continue
+
+            yield self._intify(item)
+        
+
+class BaseToken(object):
+    """A token wrapping a raw ``value``.
+
+    ``regexp`` is the compiled pattern that identifies this kind of token.
+    The base pattern accepts any text; subclasses override ``regexp`` with
+    something narrower.
+    """
+    regexp = re.compile(r'(.*)')
+
+    def __init__(self, value):
+        self.value = value
+
+    def match(self, value):
+        """Match ``regexp`` against the start of ``value``.
+
+        Returns the match object, or ``None`` when ``value`` does not match.
+        """
+        return self.regexp.match(value)
+
+    def __repr__(self):
+        # str() keeps repr() usable when ``value`` is not a string.
+        return ",".join([str(self.__class__), str(self.value)])
+
+class RangeToken(BaseToken):
+    """Token whose text starts with ``<digits>-<digits>``, e.g. ``103-114``."""
+    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
+    regexp = re.compile(r'(\d+)-(\d+)')
+   
+class NumericToken(BaseToken):
+    """Token whose text starts with one or more digits, e.g. ``12``."""
+    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
+    regexp = re.compile(r'(\d+)')
+
+
+class PastedDefParser(object):
+    """Turn a block of pasted text into a stream of typed tokens.
+
+    The text is split on whitespace and each piece is wrapped in the first
+    class of ``TOKEN_TYPES`` whose ``regexp`` matches the start of it.
+    """
+    # Ordered most specific first: the first matching class wins, and
+    # BaseToken's pattern accepts anything, so it must stay last.
+    TOKEN_TYPES = [
+        RangeToken,
+        NumericToken,
+        BaseToken,
+    ]
+
+    def load(self, infile):
+        """Tokenize and parse ``infile``.
+
+        Despite its name, ``infile`` must be a string holding the whole
+        text, not a file object. The result is a lazy generator of token
+        objects.
+        """
+        tokens = self._tokenize(infile)
+        entries = self._parse(tokens)
+        return entries
+
+    def _tokenize(self, data):
+        """Yield one token object per whitespace-separated piece of ``data``."""
+        # split() with no argument breaks on any run of whitespace (spaces,
+        # newlines, tabs) and never produces empty strings, so tab-separated
+        # words are no longer glued into a single token.
+        for item in data.split():
+            for tclass in self.TOKEN_TYPES:
+                if tclass.regexp.match(item):
+                    yield tclass(item)
+                    break
+
+    def _parse(self, tokens):
+        """Return ``tokens`` unchanged; grouping is not implemented yet."""
+        # TODO group things based on strides between RangeTokens, probably
+        # starting with range token, then the following BaseTokens are likely
+        # the field name, followed by a NumericToken, then Base/Numeric tokens
+        # for the field's description, until the next RangeToken is found.
+        results = tokens
+        return results
+
+
+# Sample input for SimpleDefParser. These statements run at module level, so
+# they execute whenever the module is run or imported.
+# load() is a generator: ``tokens`` is lazy and no row is parsed until it is
+# iterated.
+sdp = SimpleDefParser()
+tokens = sdp.load([
+    "record type,text,1",
+    "payment year, year,2-5",
+    "corrected return indicator, 6",
+    ])
+
+
+# Sample input for PastedDefParser, also executed at module level.
+# The text looks like a record-layout table pasted from a PDF (position
+# range, field name, length, description) -- presumably the format this
+# parser targets; confirm against the real source document.
+# ``tokens2`` is a lazy generator: nothing is tokenized until it is iterated.
+pdp = PastedDefParser()
+tokens2 = pdp.load("""
+103-114 Payment 
+
+Amount 5*
+
+12 The amount reported in this field represents payments for 
+
+Amount Code 5 in the “A” Record. 
+
+115-126 Payment 
+
+Amount 6*
+
+12 The amount reported in this field represents payments for 
+
+Amount Code 6 in the “A” Record. 
+
+127-138 Payment 
+
+Amount 7*
+
+12 The amount reported in this field represents payments for 
+
+Amount Code 7 in the “A” Record. 
+
+139-150 Payment 
+
+Amount 8*
+
+12 The amount reported in this field represents payments f
+""")