added length checking to field matching criteria for parser

This commit is contained in:
Binh 2012-05-08 14:08:39 -05:00
parent 2c9551f677
commit ad5262e37e

View file

@ -1,7 +1,13 @@
#!/usr/bin/python #!/usr/bin/python
# coding=UTF-8 # coding=UTF-8
"""
Parser utility to read data from Publication 1220 and
convert it into Python classes.
"""
import re import re
class SimpleDefParser(object): class SimpleDefParser(object):
def __init__(self): def __init__(self):
pass pass
@ -35,6 +41,36 @@ class SimpleDefParser(object):
yield item yield item
class LengthExpression(object):
    """Evaluate simple length constraints such as ``'=2'`` or ``'>=10'``.

    An expression is a comparison operator (one of the keys of ``OPS``)
    followed by a non-negative integer. Instances are callable:
    ``LengthExpression()(value, exps)`` is true when ``value`` satisfies
    every expression in ``exps``.

    Parsed expressions are memoised per instance in ``exp_cache`` so each
    distinct expression string is parsed only once.
    """

    # Imported in the class body (as in the original) so the class does not
    # depend on a module-level import; ``operator`` therefore also remains
    # reachable as a class attribute.
    import operator

    # Group 1: everything before the first digit (the operator text).
    # Group 2: the integer operand.
    REG = re.compile(r'([^\d]*)(\d+)')

    # Operator symbol -> two-argument comparison function.
    OPS = {
        '<': operator.lt,
        '>': operator.gt,
        '<=': operator.le,
        '>=': operator.ge,
        '=': operator.eq,
    }

    def __init__(self):
        # Maps expression string -> (comparison function, int operand).
        self.exp_cache = {}

    def __call__(self, value, exps):
        """Return True when ``value`` satisfies every expression in ``exps``.

        An empty ``exps`` is trivially satisfied and yields True.
        """
        return all(self.check(value, exp) for exp in exps)

    def compile_exp(self, exp):
        """Parse ``exp`` into a ``(comparison function, int operand)`` tuple.

        Whitespace around the operator is ignored, so ``' = 2'`` is parsed
        the same way as ``'=2'``.

        Raises:
            ValueError: if ``exp`` contains no integer operand, or if its
                operator is not one of the keys of ``OPS``.
        """
        match = self.REG.match(exp)
        if match is None:
            raise ValueError('invalid length expression: %r' % (exp,))

        symbol, operand = match.groups()
        compare = self.OPS.get(symbol.strip())
        if compare is None:
            # Fail here with a clear message instead of caching ``None`` and
            # crashing later with "'NoneType' object is not callable".
            raise ValueError(
                'unknown operator %r in length expression %r'
                % (symbol.strip(), exp)
            )
        return (compare, int(operand))

    def check(self, value, exp):
        """Return the result of applying the single expression ``exp`` to ``value``."""
        compiled = self.exp_cache.get(exp)
        if compiled is None:
            compiled = self.exp_cache[exp] = self.compile_exp(exp)
        compare, operand = compiled
        return compare(value, operand)
class BaseToken(object): class BaseToken(object):
regexp = re.compile('(.*)') regexp = re.compile('(.*)')
@ -85,17 +121,41 @@ class PastedDefParser(object):
] ]
FIELD_TYPES = [ FIELD_TYPES = [
(fields.BlankField, {'name': [ (fields.BlankField, {
re.compile(r'^blank$'), 'regexp': {
]}), 'name': [
(fields.MoneyField, {'desc': [ re.compile(r'^blank$'),
re.compile(r'right\-justified'), ],
re.compile(r'amount'), },
re.compile(r'zero\-filled'), }),
]}),
(fields.TextField, {'desc': [ (fields.MoneyField, {
re.compile(r'enter blanks') 'regexp': {
]}) 'desc': [
re.compile(r'right\-justified'),
re.compile(r'amount'),
re.compile(r'zero\-filled'),
],
},
}),
(fields.TextField, {
'regexp': {
'desc': [
re.compile(r'enter blanks'),
],
},
}),
(fields.StateField, {
'regexp': {
'desc': [
re.compile(r'state'),
re.compile(r'postal'),
],
},
'length': ['=2'],
})
] ]
def load(self, infile): def load(self, infile):
@ -204,13 +264,20 @@ class PastedDefParser(object):
def _guess_field_types(self, entries): def _guess_field_types(self, entries):
lengthexp = LengthExpression()
for entry in entries: for entry in entries:
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES)) matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
for (classtype, criteria) in self.FIELD_TYPES: for (classtype, criteria) in self.FIELD_TYPES:
for crit_key, crit_values in criteria.items(): if 'length' in criteria:
for crit_re in crit_values: if not lengthexp(int(entry['length']), criteria['length']):
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 continue
if 'regexp' in criteria:
for crit_key, crit_values in criteria['regexp'].items():
for crit_re in crit_values:
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
matches = list(matches.items()) matches = list(matches.items())
@ -230,10 +297,10 @@ class PastedDefParser(object):
# FIELD NAME # FIELD NAME
if entry['name'] == 'blank': if entry['name'] == 'blank':
add('blank%d' % blank_count) add( (u'blank%d' % blank_count).ljust(40) )
blank_count += 1 blank_count += 1
else: else:
add(entry['name']) add(entry['name'].ljust(40))
add(' = ') add(' = ')