adding a simple parser for reading stuff from pdfs
This commit is contained in:
parent
97a74c09f9
commit
6e9b8041b9
1 changed files with 129 additions and 0 deletions
129
pyaccuwage/parser.py
Normal file
129
pyaccuwage/parser.py
Normal file
|
@ -0,0 +1,129 @@
|
|||
#!/usr/bin/python
|
||||
# coding=UTF-8
|
||||
import re
|
||||
|
||||
|
||||
|
||||
class SimpleDefParser(object):
    """Parse comma-separated field definition rows into tuples of tokens.

    Every row is split on commas and each piece is normalized by
    ``_tokenize``; ``load`` produces one tuple per input row.
    """

    def load(self, infile):
        """Yield a tuple of normalized tokens for each row of *infile*.

        *infile* may be any iterable of strings (an open file, a list of
        lines, ...). Rows are processed lazily, one at a time.
        """
        for row in infile:
            yield tuple(self._tokenize(row))

    def _intify(self, x):
        """Return ``int(x)`` if *x* parses as an integer, else *x* unchanged."""
        try:
            return int(x.strip())
        except ValueError:
            return x

    def _tokenize(self, row):
        """Yield the normalized comma-separated items of *row*.

        Normalization rules, applied to each stripped item:

        * items containing a space get their spaces replaced by ``_``;
          all other items are upper-cased;
        * an item containing ``-`` whose parts are all integers is folded
          into a single number: for ``"A-B"`` the result is ``B - A``
          (``"2-5"`` gives ``3``);
        * any other item is converted to ``int`` when possible and is
          otherwise yielded as a string.

        NOTE(review): an inclusive column range ``2-5`` covers four
        columns, while ``B - A`` is three -- confirm which is intended.
        """
        for item in row.split(','):
            item = item.strip()
            if ' ' in item:
                item = item.replace(' ', '_')
            else:
                item = item.upper()

            if '-' in item:
                parts = [self._intify(part) for part in item.split('-')]
                try:
                    # Explicit left fold with ``y - x``; this avoids relying
                    # on the ``reduce`` builtin, which Python 3 removed.
                    folded = parts[0]
                    for part in parts[1:]:
                        folded = part - folded
                except TypeError:
                    # At least one part is not a number (e.g. "E-MAIL"), so
                    # this is not a range: keep the item as an ordinary
                    # value instead of crashing on string subtraction.
                    folded = self._intify(item)
                item = folded
            else:
                item = self._intify(item)
            yield item
|
||||
|
||||
|
||||
class BaseToken(object):
    """Generic token wrapping one raw piece of text.

    Subclasses narrow ``regexp`` to recognise a more specific kind of
    token; this base class accepts any text.
    """

    # Catch-all pattern: matches any string (up to the first newline).
    regexp = re.compile('(.*)')

    def __init__(self, value):
        """Store *value*, the raw text this token was built from."""
        self.value = value

    def match(self, value):
        """Match *value* against this class's ``regexp``.

        Returns the ``re`` match object, or ``None`` when *value* does not
        match at its start.
        """
        return self.regexp.match(value)

    def __repr__(self):
        """Return ``"<class repr>,<value>"``."""
        # %-formatting instead of str.join so a non-string value (e.g. an
        # int) does not raise TypeError.
        return "%s,%s" % (self.__class__, self.value)
|
||||
|
||||
class RangeToken(BaseToken):
    """Token whose text starts with a ``<digits>-<digits>`` range."""

    # Raw string: ``\d`` in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions. The
    # compiled pattern is unchanged.
    # NOTE(review): callers using ``regexp.match`` anchor only at the start,
    # so trailing text after the range (e.g. "1-2abc") still matches --
    # confirm whether the pattern should also be anchored at the end.
    regexp = re.compile(r'(\d+)-(\d+)')
|
||||
|
||||
class NumericToken(BaseToken):
    """Token whose text starts with one or more digits."""

    # Raw string: ``\d`` in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions. The
    # compiled pattern is unchanged.
    # NOTE(review): callers using ``regexp.match`` anchor only at the start,
    # so text such as "5*" also matches -- confirm whether the pattern
    # should also be anchored at the end.
    regexp = re.compile(r'(\d+)')
|
||||
|
||||
|
||||
class PastedDefParser(object):
    """Turn a pasted blob of text into a stream of classified tokens."""

    # Tried in order; the first class whose ``regexp`` matches wins, so the
    # most specific classes come first and the catch-all comes last.
    TOKEN_TYPES = [
        RangeToken,
        NumericToken,
        BaseToken,
    ]

    def load(self, infile):
        """Tokenize the text *infile* and return the parsed entries."""
        return self._parse(self._tokenize(infile))

    def _tokenize(self, data):
        """Yield one token object per non-empty, space-separated word.

        Newlines are treated as spaces. Each word is wrapped in the first
        class from ``TOKEN_TYPES`` whose pattern matches it; a word that no
        class matches is dropped.
        """
        flattened = data.replace('\n', ' ')
        for chunk in flattened.split(' '):
            word = chunk.strip()
            if len(word) == 0:
                continue
            token_class = next(
                (candidate for candidate in self.TOKEN_TYPES
                 if candidate.regexp.match(word)),
                None,
            )
            if token_class is not None:
                yield token_class(word)

    def _parse(self, tokens):
        """Return *tokens* unchanged (grouping is not implemented yet)."""
        # TODO group things based on strides between RangeTokens, probably
        # starting with a range token, then the following BaseTokens are
        # likely the field name, followed by a NumericToken, then
        # Base/Numeric tokens for the field's description, until the next
        # RangeToken is found.
        return tokens
|
||||
|
||||
|
||||
# Sample usage of both parsers. These statements sit at module level, so
# they execute every time this module is imported.

# SimpleDefParser.load is a generator function: ``tokens`` is a lazy
# generator and no row is tokenized until it is iterated.
sdp = SimpleDefParser()
tokens = sdp.load([
    "record type,text,1",
    "payment year, year,2-5",
    "corrected return indicator, 6",
])


# PastedDefParser.load returns the generator produced by its tokenizer, so
# ``tokens2`` is likewise lazy. The sample text below ends mid-sentence
# ("payments f"); it is kept exactly as written.
pdp = PastedDefParser()
tokens2 = pdp.load("""
103-114 Payment

Amount 5*

12 The amount reported in this field represents payments for

Amount Code 5 in the “A” Record.

115-126 Payment

Amount 6*

12 The amount reported in this field represents payments for

Amount Code 6 in the “A” Record.

127-138 Payment

Amount 7*

12 The amount reported in this field represents payments for

Amount Code 7 in the “A” Record.

139-150 Payment

Amount 8*

12 The amount reported in this field represents payments f
""")
|
Loading…
Add table
Add a link
Reference in a new issue