#!/usr/bin/python
# coding=UTF-8
"""Experimental parsers for record field definitions.

``SimpleDefParser`` reads comma-separated definition rows, and
``PastedDefParser`` classifies the whitespace-separated words of a block of
text (see the samples at the bottom of this module).
"""
import re
from functools import reduce  # builtin on Python 2 only; this works on 2 and 3


class SimpleDefParser(object):
    """Parse comma-separated definition rows into tuples of tokens."""

    def load(self, infile):
        """Lazily yield one tuple of tokens for every row of *infile*.

        *infile* is any iterable of strings, e.g. a list of lines.
        """
        for row in infile:
            yield tuple(self._tokenize(row))

    def _intify(self, x):
        """Return *x* as an int when it parses as one, otherwise unchanged."""
        try:
            x = int(x.strip())
        except ValueError:
            pass
        return x

    def _tokenize(self, row):
        """Yield the normalized comma-separated items of *row*.

        Items containing spaces get the spaces replaced by underscores; all
        other items are upper-cased.  Plain integers become ``int``.  A
        hyphenated item whose parts are all integers is folded to
        ``end - start`` (``"2-5"`` yields ``3``); any other hyphenated item
        is kept as text instead of raising ``TypeError``.
        """
        for item in row.split(','):
            item = item.strip()
            if ' ' in item:
                item = item.replace(' ', '_')
            else:
                item = item.upper()

            if '-' in item:
                parts = [self._intify(part) for part in item.split('-')]
                # Only do arithmetic when every part really is a number;
                # "non-numeric" or "-5" would otherwise subtract strings.
                if all(isinstance(part, int) for part in parts):
                    # NOTE(review): if the bounds are inclusive positions the
                    # width would be end - start + 1 -- confirm before use.
                    item = reduce(lambda x, y: y - x, parts)
            else:
                item = self._intify(item)
            yield item


class BaseToken(object):
    """Catch-all token; its pattern accepts any text."""

    regexp = re.compile(r'(.*)')

    def __init__(self, value):
        self.value = value

    def match(self, value):
        """Return the match object of ``regexp`` against *value*, or None."""
        return self.regexp.match(value)

    def __repr__(self):
        return ",".join([str(self.__class__), str(self.value)])


class RangeToken(BaseToken):
    """Token of the form ``<digits>-<digits>``."""

    regexp = re.compile(r'(\d+)-(\d+)')


class NumericToken(BaseToken):
    """Token consisting of digits."""

    regexp = re.compile(r'(\d+)')


class PastedDefParser(object):
    """Split text on whitespace and classify each word as a token."""

    # Tried in order; the first type whose pattern covers the whole word wins,
    # so BaseToken (which accepts anything) must stay last.
    TOKEN_TYPES = [
        RangeToken,
        NumericToken,
        BaseToken,
    ]

    def load(self, infile):
        """Return the entries parsed from *infile*.

        *infile* may be a string or any object with a ``read()`` method.
        """
        if hasattr(infile, 'read'):
            infile = infile.read()
        tokens = self._tokenize(infile)
        entries = self._parse(tokens)
        return entries

    def _tokenize(self, data):
        """Yield one token object per whitespace-separated word of *data*."""
        # str.split() with no argument drops empty items and handles
        # newlines, tabs and runs of spaces alike.
        for item in data.split():
            for tclass in self.TOKEN_TYPES:
                found = tclass.regexp.match(item)
                # Require the pattern to consume the whole word so that
                # e.g. "5*" is not mistaken for a number.
                if found is not None and found.end() == len(item):
                    yield tclass(item)
                    break

    def _parse(self, tokens):
        """Return *tokens* unchanged; grouping is not implemented yet."""
        # TODO group things based on strides between RangeTokens, probably
        # starting with range token, then the following BaseTokens are likely
        # the field name, followed by a NumericToken, then Base/Numeric tokens
        # for the field's description, until the next RangeToken is found.
        results = tokens
        return results


sdp = SimpleDefParser()
tokens = sdp.load([
    "record type,text,1",
    "payment year, year,2-5",
    "corrected return indicator, 6",
    ])


pdp = PastedDefParser()
tokens2 = pdp.load("""
103-114 Payment

Amount 5*

12 The amount reported in this field represents payments for

Amount Code 5 in the “A” Record.

115-126 Payment

Amount 6*

12 The amount reported in this field represents payments for

Amount Code 6 in the “A” Record.

127-138 Payment

Amount 7*

12 The amount reported in this field represents payments for

Amount Code 7 in the “A” Record.

139-150 Payment

Amount 8*

12 The amount reported in this field represents payments f
""")