Added length checking to the parser's field-matching criteria
This commit is contained in:
parent
2c9551f677
commit
ad5262e37e
1 changed file with 85 additions and 18 deletions
|
@ -1,7 +1,13 @@
|
||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
# coding=UTF-8
|
# coding=UTF-8
|
||||||
|
"""
|
||||||
|
Parser utility to read data from Publication 1220 and
|
||||||
|
convert it into python classes.
|
||||||
|
|
||||||
|
"""
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
class SimpleDefParser(object):
|
class SimpleDefParser(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
@ -35,6 +41,36 @@ class SimpleDefParser(object):
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
|
|
||||||
|
class LengthExpression(object):
    """Evaluate simple length constraints such as ``'=2'`` or ``'<=40'``.

    An expression is a comparison operator (one of the keys of ``OPS``)
    followed by a non-negative integer. Instances are callable: calling
    one with a value and a list of expressions returns True only when the
    value satisfies every expression. Compiled expressions are cached per
    instance in ``exp_cache``.
    """

    # Imported in the class body so the class stays self-contained; as a
    # side effect ``operator`` is also reachable as a class attribute.
    import operator

    # Group 1: everything before the first digit (the operator text),
    # group 2: the integer operand.
    REG = re.compile(r'([^\d]*)(\d+)')

    # Maps the textual operator to the function implementing it.
    OPS = {
        '<': operator.lt,
        '>': operator.gt,
        '<=': operator.le,
        '>=': operator.ge,
        '=': operator.eq,
    }

    def __init__(self):
        # Maps expression string -> (comparison function, integer operand).
        self.exp_cache = {}

    def __call__(self, value, exps):
        """Return True when ``value`` satisfies every expression in ``exps``.

        An empty ``exps`` yields True. Every expression is evaluated (no
        short-circuit) so a malformed expression is always reported, even
        when an earlier one already failed.

        Raises:
            ValueError: if any expression is malformed.
        """
        results = [self.check(value, exp) for exp in exps]
        return all(results)

    def compile_exp(self, exp):
        """Parse ``exp`` into a ``(comparison function, int operand)`` tuple.

        Whitespace around the operator is ignored, so ``'<= 40'`` is
        accepted.

        Raises:
            ValueError: if ``exp`` contains no integer operand or uses an
                operator that is not a key of ``OPS``.
        """
        match = self.REG.match(exp)
        if match is None:
            raise ValueError('invalid length expression: %r' % (exp,))

        symbol, number = match.groups()
        symbol = symbol.strip()
        try:
            compare = self.OPS[symbol]
        except KeyError:
            raise ValueError(
                'unknown operator %r in length expression %r' % (symbol, exp))
        return (compare, int(number))

    def check(self, value, exp):
        """Return the result of applying the expression ``exp`` to ``value``.

        The compiled form of ``exp`` is stored in ``exp_cache`` on first
        use; a malformed expression raises ValueError and is not cached.
        """
        try:
            compare, operand = self.exp_cache[exp]
        except KeyError:
            compare, operand = self.exp_cache[exp] = self.compile_exp(exp)
        return compare(value, operand)
|
||||||
|
|
||||||
|
|
||||||
class BaseToken(object):
|
class BaseToken(object):
|
||||||
regexp = re.compile('(.*)')
|
regexp = re.compile('(.*)')
|
||||||
|
|
||||||
|
@ -85,18 +121,42 @@ class PastedDefParser(object):
|
||||||
]
|
]
|
||||||
|
|
||||||
FIELD_TYPES = [
|
FIELD_TYPES = [
|
||||||
(fields.BlankField, {'name': [
|
(fields.BlankField, {
|
||||||
re.compile(r'^blank$'),
|
'regexp': {
|
||||||
]}),
|
'name': [
|
||||||
(fields.MoneyField, {'desc': [
|
re.compile(r'^blank$'),
|
||||||
re.compile(r'right\-justified'),
|
],
|
||||||
re.compile(r'amount'),
|
},
|
||||||
re.compile(r'zero\-filled'),
|
}),
|
||||||
]}),
|
|
||||||
(fields.TextField, {'desc': [
|
(fields.MoneyField, {
|
||||||
re.compile(r'enter blanks')
|
'regexp': {
|
||||||
]})
|
'desc': [
|
||||||
]
|
re.compile(r'right\-justified'),
|
||||||
|
re.compile(r'amount'),
|
||||||
|
re.compile(r'zero\-filled'),
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
|
||||||
|
(fields.TextField, {
|
||||||
|
'regexp': {
|
||||||
|
'desc': [
|
||||||
|
re.compile(r'enter blanks'),
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
|
||||||
|
(fields.StateField, {
|
||||||
|
'regexp': {
|
||||||
|
'desc': [
|
||||||
|
re.compile(r'state'),
|
||||||
|
re.compile(r'postal'),
|
||||||
|
],
|
||||||
|
},
|
||||||
|
'length': ['=2'],
|
||||||
|
})
|
||||||
|
]
|
||||||
|
|
||||||
def load(self, infile):
|
def load(self, infile):
|
||||||
tokens = self._tokenize(infile)
|
tokens = self._tokenize(infile)
|
||||||
|
@ -204,14 +264,21 @@ class PastedDefParser(object):
|
||||||
|
|
||||||
|
|
||||||
def _guess_field_types(self, entries):
|
def _guess_field_types(self, entries):
|
||||||
|
lengthexp = LengthExpression()
|
||||||
|
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
|
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
|
||||||
|
|
||||||
for (classtype, criteria) in self.FIELD_TYPES:
|
for (classtype, criteria) in self.FIELD_TYPES:
|
||||||
for crit_key, crit_values in criteria.items():
|
if 'length' in criteria:
|
||||||
for crit_re in crit_values:
|
if not lengthexp(int(entry['length']), criteria['length']):
|
||||||
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
continue
|
||||||
|
|
||||||
|
if 'regexp' in criteria:
|
||||||
|
for crit_key, crit_values in criteria['regexp'].items():
|
||||||
|
for crit_re in crit_values:
|
||||||
|
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
||||||
|
|
||||||
|
|
||||||
matches = list(matches.items())
|
matches = list(matches.items())
|
||||||
matches.sort(key=lambda x:x[1])
|
matches.sort(key=lambda x:x[1])
|
||||||
|
@ -230,10 +297,10 @@ class PastedDefParser(object):
|
||||||
|
|
||||||
# FIELD NAME
|
# FIELD NAME
|
||||||
if entry['name'] == 'blank':
|
if entry['name'] == 'blank':
|
||||||
add('blank%d' % blank_count)
|
add( (u'blank%d' % blank_count).ljust(40) )
|
||||||
blank_count += 1
|
blank_count += 1
|
||||||
else:
|
else:
|
||||||
add(entry['name'])
|
add(entry['name'].ljust(40))
|
||||||
|
|
||||||
add(' = ')
|
add(' = ')
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue