adding a simple parser for reading stuff from pdfs

2012-04-05 15:19:00 -05:00 · 2012-04-05 15:19:00 -05:00 · 6e9b8041b9
commit 6e9b8041b9
parent 97a74c09f9
1 changed files with 129 additions and 0 deletions
--- a/pyaccuwage/parser.py
+++ b/pyaccuwage/parser.py
@@ -0,0 +1,129 @@
 #!/usr/bin/python
 # coding=UTF-8
 import re
 class SimpleDefParser(object):
    """Tokenize comma-separated field-definition rows.

    A row such as ``"payment year, year,2-5"`` becomes one tuple of tokens;
    ``_tokenize`` describes how each comma-separated item is normalized.
    """

    def __init__(self):
        pass

    def load(self, infile):
        """Lazily yield one tuple of tokens per row of *infile*.

        *infile* is any iterable of strings (an open file, a list of lines).
        Nothing is tokenized until the returned generator is consumed.
        """
        for row in infile:
            yield tuple(self._tokenize(row))

    def _intify(self, x):
        """Return ``int(x.strip())``, or *x* unchanged if that conversion fails."""
        try:
            return int(x.strip())
        except ValueError:
            return x

    def _fold_range(self, item):
        """Fold a hyphen-separated run of integers into a single integer.

        The parts are combined left to right as ``next - accumulated``, so
        ``"2-5"`` gives ``3``. Returns None when any part is not an integer
        (e.g. ``"non-resident"``), letting the caller keep the item as text
        instead of failing on string subtraction.
        """
        # NOTE(review): "2-5" yields end - start (3), not the inclusive
        # length (4) -- confirm which one callers expect.
        try:
            numbers = [int(part.strip()) for part in item.split('-')]
        except ValueError:
            return None
        # Explicit loop rather than reduce(): the bare builtin does not exist
        # on Python 3.
        folded = numbers[0]
        for number in numbers[1:]:
            folded = number - folded
        return folded

    def _tokenize(self, row):
        """Yield the normalized tokens of one comma-separated *row*.

        Each item is stripped; items containing a space get the spaces
        replaced by underscores, all other items are upper-cased. Items with
        a hyphen are folded by ``_fold_range`` when every part is an integer;
        everything else is passed through ``_intify``.
        """
        # NOTE(review): upper-casing only happens for items without spaces;
        # multi-word items keep their original case -- confirm this is
        # intended.
        for item in row.split(','):
            item = item.strip()
            if ' ' in item:
                item = item.replace(' ', '_')
            else:
                item = item.upper()
            folded = self._fold_range(item) if '-' in item else None
            if folded is None:
                yield self._intify(item)
            else:
                yield folded
 class BaseToken(object):
    """Catch-all token wrapping one chunk of text.

    Subclasses override ``regexp`` to recognize narrower shapes of text;
    ``value`` holds the raw text the token was built from.
    """

    # Matches any text, including the empty string.
    regexp = re.compile('(.*)')

    def __init__(self, value):
        self.value = value

    def match(self, value):
        """Apply ``regexp`` at the start of *value*.

        Returns the match object, or None when *value* does not match.
        """
        return self.regexp.match(value)

    def __repr__(self):
        # str() keeps repr() usable even when value is not a string.
        return ",".join([str(self.__class__), str(self.value)])
 class RangeToken(BaseToken):
    """Token for text of the form ``<digits>-<digits>``, e.g. ``103-114``."""

    # Raw string: ``\d`` is a regex escape, not a string escape. The compiled
    # pattern is the same as before.
    regexp = re.compile(r'(\d+)-(\d+)')
 class NumericToken(BaseToken):
    """Token for text that starts with one or more digits, e.g. ``12``."""

    # Raw string: ``\d`` is a regex escape, not a string escape. The compiled
    # pattern is the same as before.
    regexp = re.compile(r'(\d+)')
 class PastedDefParser(object):
    """Turn a pasted block of text into a stream of typed tokens."""

    # Tried in order; the first class whose regexp matches a word wins, so
    # the most specific patterns come first.
    TOKEN_TYPES = [RangeToken, NumericToken, BaseToken]

    def load(self, infile):
        """Tokenize the string *infile* and hand the tokens to ``_parse``."""
        return self._parse(self._tokenize(infile))

    def _tokenize(self, data):
        """Yield one token object per non-empty, space-separated word of *data*.

        Newlines are treated as spaces. A word that matches none of
        ``TOKEN_TYPES`` produces no token.
        """
        flattened = data.replace('\n', ' ')
        for chunk in flattened.split(' '):
            word = chunk.strip()
            if not word:
                continue
            token_class = next(
                (candidate for candidate in self.TOKEN_TYPES
                 if candidate.regexp.match(word)),
                None,
            )
            if token_class is not None:
                yield token_class(word)

    def _parse(self, tokens):
        """Return *tokens* unchanged (grouping is not implemented yet)."""
        # TODO group things based on strides between RangeTokens, probably
        # starting with a RangeToken; the BaseTokens that follow are likely
        # the field name, followed by a NumericToken, then Base/Numeric tokens
        # for the field's description, until the next RangeToken is found.
        return tokens
 # Ad-hoc demo of both parsers. These are module-level statements, so they run
 # every time the module is imported or executed.
 sdp = SimpleDefParser()
 # SimpleDefParser.load() is a generator function: `tokens` is a lazy iterator
 # and no row is tokenized until it is consumed.
 tokens = sdp.load([
    "record type,text,1",
    "payment year, year,2-5",
    "corrected return indicator, 6",
    ])
 pdp = PastedDefParser()
 # Sample field definitions pasted as plain text; presumably each entry is a
 # position range, a field name, a number and a description (see the TODO in
 # PastedDefParser._parse) -- confirm against the source document. The last
 # description line of the sample is cut off mid-word.
 # `tokens2` is also a lazy generator; nothing is tokenized here.
 tokens2 = pdp.load("""
 103-114 Payment 
 Amount 5*
 12 The amount reported in this field represents payments for 
 Amount Code 5 in the “A” Record. 
 115-126 Payment 
 Amount 6*
 12 The amount reported in this field represents payments for 
 Amount Code 6 in the “A” Record. 
 127-138 Payment 
 Amount 7*
 12 The amount reported in this field represents payments for 
 Amount Code 7 in the “A” Record. 
 139-150 Payment 
 Amount 8*
 12 The amount reported in this field represents payments f
 """)