added length checking to field matching criteria for parser

This commit is contained in:
Binh 2012-05-08 14:08:39 -05:00
parent 2c9551f677
commit ad5262e37e

View file

@ -1,7 +1,13 @@
#!/usr/bin/python
# coding=UTF-8
"""
Parser utility to read data from Publication 1220 and
convert it into Python classes.
"""
import re
class SimpleDefParser(object):
def __init__(self):
pass
@ -35,6 +41,36 @@ class SimpleDefParser(object):
yield item
class LengthExpression(object):
    """Evaluate simple length constraints such as ``'=2'`` or ``'>=10'``.

    An expression is a comparison operator (one of the keys of ``OPS``),
    optionally surrounded by whitespace, followed by a run of digits.
    Instances are callable: ``checker(value, exps)`` is true when ``value``
    satisfies every expression in ``exps``.  Compiled expressions are
    cached per instance in ``exp_cache`` so each distinct expression string
    is parsed only once.
    """
    # Kept as a class-body import (as in the original) so the class does not
    # depend on a module-level ``import operator`` being present.
    import operator

    # Group 1: everything before the first digit (the operator text);
    # group 2: the run of digits that follows.
    REG = re.compile(r'([^\d]*)(\d+)')
    OPS = {
        '<': operator.lt,
        '>': operator.gt,
        '<=': operator.le,
        '>=': operator.ge,
        '=': operator.eq,
    }

    def __init__(self):
        # Maps expression string -> (comparison function, integer operand).
        self.exp_cache = {}

    def __call__(self, value, exps):
        """Return True when ``value`` satisfies every expression in ``exps``.

        An empty ``exps`` is trivially satisfied.  Evaluation stops at the
        first expression that fails.
        """
        return all(self.check(value, exp) for exp in exps)

    def compile_exp(self, exp):
        """Parse ``exp`` into a ``(comparison function, int)`` pair.

        Raises:
            ValueError: if ``exp`` contains no digits or its operator is not
                one of the keys of ``OPS``.
        """
        match = self.REG.match(exp)
        if match is None:
            raise ValueError('invalid length expression: %r' % (exp,))
        op_text, digits = match.groups()
        # Tolerate spacing such as ' >= 4'; the operator table has no
        # whitespace in its keys.
        op = self.OPS.get(op_text.strip())
        if op is None:
            # Fail here with a clear message instead of returning None and
            # letting check() die with "'NoneType' object is not callable".
            raise ValueError(
                'unknown operator %r in length expression %r; expected one '
                'of %s' % (op_text.strip(), exp, ', '.join(sorted(self.OPS))))
        return (op, int(digits))

    def check(self, value, exp):
        """Return the result of applying the single expression ``exp`` to ``value``."""
        try:
            op, opval = self.exp_cache[exp]
        except KeyError:
            op, opval = self.exp_cache[exp] = self.compile_exp(exp)
        return op(value, opval)
class BaseToken(object):
regexp = re.compile('(.*)')
@ -85,17 +121,41 @@ class PastedDefParser(object):
]
# Table of (field class, criteria) pairs that _guess_field_types iterates
# over to score each parsed entry against every candidate field class.
#   'regexp': maps an entry key (e.g. 'name', 'desc') to a list of compiled
#             patterns; each pattern that matches entry[key] adds one to
#             that class's score.
#   'length': list of length expressions (e.g. '=2') evaluated against
#             int(entry['length']) by LengthExpression.
# NOTE(review): this span appears to interleave the pre-change and
# post-change versions of the table (diff view without +/- markers) --
# confirm against the real file before relying on individual lines.
FIELD_TYPES = [
(fields.BlankField, {'name': [
(fields.BlankField, {
'regexp': {
'name': [
re.compile(r'^blank$'),
]}),
(fields.MoneyField, {'desc': [
],
},
}),
(fields.MoneyField, {
'regexp': {
'desc': [
re.compile(r'right\-justified'),
re.compile(r'amount'),
re.compile(r'zero\-filled'),
]}),
(fields.TextField, {'desc': [
re.compile(r'enter blanks')
]})
],
},
}),
(fields.TextField, {
'regexp': {
'desc': [
re.compile(r'enter blanks'),
],
},
}),
(fields.StateField, {
'regexp': {
'desc': [
re.compile(r'state'),
re.compile(r'postal'),
],
},
'length': ['=2'],
})
]
def load(self, infile):
@ -204,11 +264,18 @@ class PastedDefParser(object):
def _guess_field_types(self, entries):
lengthexp = LengthExpression()
for entry in entries:
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
for (classtype, criteria) in self.FIELD_TYPES:
for crit_key, crit_values in criteria.items():
if 'length' in criteria:
if not lengthexp(int(entry['length']), criteria['length']):
continue
if 'regexp' in criteria:
for crit_key, crit_values in criteria['regexp'].items():
for crit_re in crit_values:
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
@ -230,10 +297,10 @@ class PastedDefParser(object):
# FIELD NAME
if entry['name'] == 'blank':
add('blank%d' % blank_count)
add( (u'blank%d' % blank_count).ljust(40) )
blank_count += 1
else:
add(entry['name'])
add(entry['name'].ljust(40))
add(' = ')