diff --git a/pyaccuwage/parser.py b/pyaccuwage/parser.py index 1e99f0e..d00155e 100644 --- a/pyaccuwage/parser.py +++ b/pyaccuwage/parser.py @@ -1,7 +1,13 @@ #!/usr/bin/python # coding=UTF-8 +""" +Parser utility to read data from Publication 1220 and +convert it into python classes. + +""" import re + class SimpleDefParser(object): def __init__(self): pass @@ -35,6 +41,36 @@ class SimpleDefParser(object): yield item +class LengthExpression(object): + import operator + REG = re.compile(r'([^\d]*)(\d+)') + OPS = { + '<': operator.lt, + '>': operator.gt, + '<=': operator.le, + '>=': operator.ge, + '=': operator.eq, + } + + def __init__(self): + self.exp_cache = {} + + def __call__(self, value, exps): + return len(exps) == sum(map(lambda x: self.check(value, x), exps)) + + def compile_exp(self, exp): + op, val = self.REG.match(exp).groups() + val = int(val) + return (self.OPS.get(op, None), val) + + def check(self, value, exp): + if exp not in self.exp_cache: + (op, opval) = self.exp_cache[exp] = self.compile_exp(exp) + else: + (op, opval) = self.exp_cache[exp] + return op(value, opval) + + class BaseToken(object): regexp = re.compile('(.*)') @@ -85,18 +121,42 @@ class PastedDefParser(object): ] FIELD_TYPES = [ - (fields.BlankField, {'name': [ - re.compile(r'^blank$'), - ]}), - (fields.MoneyField, {'desc': [ - re.compile(r'right\-justified'), - re.compile(r'amount'), - re.compile(r'zero\-filled'), - ]}), - (fields.TextField, {'desc': [ - re.compile(r'enter blanks') - ]}) - ] + (fields.BlankField, { + 'regexp': { + 'name': [ + re.compile(r'^blank$'), + ], + }, + }), + + (fields.MoneyField, { + 'regexp': { + 'desc': [ + re.compile(r'right\-justified'), + re.compile(r'amount'), + re.compile(r'zero\-filled'), + ], + }, + }), + + (fields.TextField, { + 'regexp': { + 'desc': [ + re.compile(r'enter blanks'), + ], + }, + }), + + (fields.StateField, { + 'regexp': { + 'desc': [ + re.compile(r'state'), + re.compile(r'postal'), + ], + }, + 'length': ['=2'], + }) + ] def load(self, infile): tokens = self._tokenize(infile) @@ -204,14 +264,21 @@ class PastedDefParser(object): def _guess_field_types(self, entries): + lengthexp = LengthExpression() + for entry in entries: matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES)) for (classtype, criteria) in self.FIELD_TYPES: - for crit_key, crit_values in criteria.items(): - for crit_re in crit_values: - matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 - + if 'length' in criteria: + if not lengthexp(int(entry['length']), criteria['length']): + continue + + if 'regexp' in criteria: + for crit_key, crit_values in criteria['regexp'].items(): + for crit_re in crit_values: + matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 + matches = list(matches.items()) matches.sort(key=lambda x:x[1]) @@ -230,10 +297,10 @@ class PastedDefParser(object): # FIELD NAME if entry['name'] == 'blank': - add('blank%d' % blank_count) + add( (u'blank%d' % blank_count).ljust(40) ) blank_count += 1 else: - add(entry['name']) + add(entry['name'].ljust(40)) add(' = ')