adding a simple parser for reading stuff from pdfs

This commit is contained in:
Binh 2012-04-05 15:19:00 -05:00
parent 97a74c09f9
commit 6e9b8041b9

129
pyaccuwage/parser.py Normal file
View file

@ -0,0 +1,129 @@
#!/usr/bin/python
# coding=UTF-8
import re
class SimpleDefParser(object):
    """Parse comma-separated definition rows into tuples of tokens.

    Each row is split on commas and every item is normalized:

    * surrounding whitespace is stripped;
    * an item containing a space has its spaces replaced by underscores,
      any other item is upper-cased;
    * an item made only of integers joined by hyphens (``"2-5"``) is
      replaced by the difference ``end - start`` (``3``);
    * any other item is converted to ``int`` when possible and otherwise
      kept as text.
    """

    def load(self, infile):
        """Lazily yield one tuple of tokens for each row of *infile*.

        *infile* may be any iterable of strings (a list of lines, an open
        text file, ...). Nothing is parsed until the result is iterated.
        """
        for row in infile:
            yield tuple(self._tokenize(row))

    def _intify(self, x):
        """Return ``int(x.strip())``, or *x* unchanged if it is not an integer."""
        try:
            return int(x.strip())
        except ValueError:
            return x

    def _tokenize(self, row):
        """Yield the normalized tokens of a single comma-separated *row*."""
        # ``reduce`` is no longer a builtin on Python 3; functools provides
        # it on both Python 2.6+ and Python 3.
        from functools import reduce

        for item in row.split(','):
            item = item.strip()
            if ' ' in item:
                item = item.replace(' ', '_')
            else:
                item = item.upper()

            parts = [self._intify(part) for part in item.split('-')]
            if len(parts) > 1 and all(isinstance(p, int) for p in parts):
                # Fold left to right; for two parts this is ``end - start``.
                item = reduce(lambda start, end: end - start, parts)
            else:
                # No hyphen, or a hyphenated item that is not purely numeric
                # (e.g. "FOO-BAR" or "-5"): convert as a whole instead of
                # attempting to subtract strings.
                item = self._intify(item)
            yield item
class BaseToken(object):
    """Generic token wrapping one piece of text.

    Subclasses override ``regexp`` to recognize a more specific shape of
    text. The pattern here matches any string, so this class works as a
    catch-all.
    """

    # Catch-all pattern: ``match`` succeeds on any string, even an empty one.
    regexp = re.compile('(.*)')

    def __init__(self, value):
        """Store *value*, the raw text this token was built from."""
        self.value = value

    def match(self, value):
        """Match *value* against this token's pattern.

        Returns the ``re`` match object, or ``None`` when the start of
        *value* does not fit the pattern.
        """
        return self.regexp.match(value)

    def __repr__(self):
        # str() around the value keeps repr() usable for non-string values.
        return ",".join([str(self.__class__), str(self.value)])
class RangeToken(BaseToken):
    """Token for text beginning with two digit runs joined by a hyphen, e.g. ``103-114``."""

    # Raw string: ``\d`` in a plain literal is an invalid escape sequence.
    # Group 1 is the number before the hyphen, group 2 the number after it.
    regexp = re.compile(r'(\d+)-(\d+)')
class NumericToken(BaseToken):
    """Token for text beginning with one or more digits, e.g. ``12``."""

    # Raw string: ``\d`` in a plain literal is an invalid escape sequence.
    # Used with ``match``, this only anchors at the start, so text such as
    # ``5*`` is also accepted.
    regexp = re.compile(r'(\d+)')
class PastedDefParser(object):
    """Turn a block of free-form text into a stream of typed tokens."""

    # Candidate token classes, tried in order for every word; the first one
    # whose pattern matches is used. BaseToken's pattern matches anything,
    # so it has to stay last.
    TOKEN_TYPES = [
        RangeToken,
        NumericToken,
        BaseToken,
    ]

    def load(self, infile):
        """Tokenize and parse *infile*, returning the resulting entries.

        Despite its name, *infile* must be a string holding the whole text,
        not an open file: it is split with ``str`` methods. The result is
        lazy; nothing is tokenized until it is iterated.
        """
        tokens = self._tokenize(infile)
        return self._parse(tokens)

    def _tokenize(self, data):
        """Yield one token object for each whitespace-separated word in *data*."""
        # split() with no argument breaks on any run of whitespace (spaces,
        # tabs, newlines, carriage returns) and never yields empty strings.
        for item in data.split():
            for tclass in self.TOKEN_TYPES:
                if tclass.regexp.match(item):
                    yield tclass(item)
                    break

    def _parse(self, tokens):
        """Return *tokens* unchanged; grouping is not implemented yet."""
        # TODO group things based on strides between RangeTokens, probably
        # starting with range token, then the following BaseTokens are likely
        # the field name, followed by a NumericToken, then Base/Numeric tokens
        # for the field's description, until the next RangeToken is found.
        return tokens
# Example use of SimpleDefParser on three comma-separated rows. load() is a
# generator, so no row is tokenized until `tokens` is iterated.
# NOTE(review): these statements appear to sit at module level, which would
# make them run on every import of this module -- confirm that is intended.
sdp = SimpleDefParser()
tokens = sdp.load([
"record type,text,1",
"payment year, year,2-5",
"corrected return indicator, 6",
])
# Example use of PastedDefParser on a block of free-form text (presumably
# pasted from a record-layout document -- TODO confirm the source). The
# sample text stops mid-sentence.
pdp = PastedDefParser()
tokens2 = pdp.load("""
103-114 Payment
Amount 5*
12 The amount reported in this field represents payments for
Amount Code 5 in the A Record.
115-126 Payment
Amount 6*
12 The amount reported in this field represents payments for
Amount Code 6 in the A Record.
127-138 Payment
Amount 7*
12 The amount reported in this field represents payments for
Amount Code 7 in the A Record.
139-150 Payment
Amount 8*
12 The amount reported in this field represents payments f
""")