Parser is mostly working, but there's an issue with the last grouping of tokens

not being parsed. This can probably be fixed by yielding an end-marker from the
tokenizer generator so the compiler knows to clear out the last item.
This commit is contained in:
Binh 2012-04-13 14:39:02 -05:00
parent 6e9b8041b9
commit 027b44b65c

View file

@ -41,31 +41,53 @@ class BaseToken(object):
regexp = re.compile('(.*)') regexp = re.compile('(.*)')
def __init__(self, value): def __init__(self, value):
self.value = value self._value = value
def match(self, value): def match(self, value):
return self.regexp return self.regexp
def __repr__(self): def __repr__(self):
return ",".join([str(self.__class__), self.value]) return ",".join([str(self.__class__), self._value])
class StringToken(BaseToken):
regexp = re.compile('(.*)')
@property
def value(self):
return self._value.decode('ascii','ignore')
class RangeToken(BaseToken): class RangeToken(BaseToken):
regexp = re.compile('(\d+)-(\d+)') regexp = re.compile('(\d+)-(\d+)')
@property
def value(self):
return reduce(lambda x,y: y-x, map(int, self._value.split('-')))+1
@property
def end_position(self):
return int(self._value.split('-')[1])
class NumericToken(BaseToken): class NumericToken(BaseToken):
regexp = re.compile('(\d+)') regexp = re.compile('^(\d+)$')
@property
def value(self):
return int(self._value)
class PastedDefParser(object): class PastedDefParser(object):
TOKEN_TYPES = [ TOKEN_TYPES = [
RangeToken, RangeToken,
NumericToken, NumericToken,
BaseToken, StringToken,
] ]
def load(self, infile): def load(self, infile):
tokens = self._tokenize(infile) tokens = self._tokenize(infile)
entries = self._parse(tokens) entries = self._parse(tokens)
#entries = self._compile(entries)
return entries return entries
def _tokenize(self, data): def _tokenize(self, data):
@ -82,10 +104,85 @@ class PastedDefParser(object):
# TODO group things based on strides between RangeTokens, probably # TODO group things based on strides between RangeTokens, probably
# starting with range token, then the following BaseTokens are likely # starting with range token, then the following BaseTokens are likely
# the field name, followed by a NumericToken, then Base/Numeric tokens # the field name, followed by a NumericToken, then Base/Numeric tokens
# for the field's description, until then ext RangeToken is found. # for the field's description, until then next RangeToken is found.
results = tokens groups = []
current_range = None
current_name = []
current_length = None
current_desc = []
state = 'range'
byte_pos = None
# COLLECT TOKENS INTO GROUPS
for token in tokens:
"""
if byte_pos == None:
# START. IF byte_pos IS NONE, WE ASSUME WE'RE STARTING WITH
# A NEW SERIES OF FIELDS. THE FIRST VALUE WHICH IS A RANGE
# OR A NUMERIC FIELD IS ASSUMED TO BE THE FIELD POSITION/RANGE.
if isinstance(token, RangeToken):
byte_pos = token.end_position + 1
elif isinstance(token, NumericToken):
byte_pos = token.value + 1
"""
if isinstance(token, NumericToken):
# THIS MAY BE A RANGE, IF THE VALUE MATCHES THE CURRENT byte_pos
# THEN WE ASSUME THIS TO BE A SINGLE BYTE RANGE DEFINITION,
# AND WE WILL TREAT IT AS A NEW RECORD.
if byte_pos == None or token.value == byte_pos:
# UPDATE RANGE POSITION
byte_pos = token.value + 1
# CONVERT TOKEN INTO RangeToken
token = RangeToken("%d-%d" % (token.value, token.value))
if isinstance(token, RangeToken):
if current_range:
groups.append({
'byterange': current_range,
'name': current_name,
'length': current_length,
'desc': current_desc,
})
# UPDATE RANGE POSITION
byte_pos = token.end_position + 1
current_range = token
current_name = []
current_length = None
current_desc = []
state = 'name'
elif state == 'name':
if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken):
current_length = current_name.pop()
state = 'desc'
else:
current_name.append(token)
if state == 'desc':
current_desc.append(token)
results = groups
return results return results
def _compile(self, groups):
for g in groups:
assert g['byterange'].value == g['length'].value
name = u'_'.join(map(lambda x:x.value.lower(), g['name']))
name = re.sub(r'[^\w]','', name)
yield({
'name': name,
'desc': u' '.join(map(lambda x:unicode(x.value), g['desc'])),
'length': g['byterange'].value,
})
sdp = SimpleDefParser() sdp = SimpleDefParser()
tokens = sdp.load([ tokens = sdp.load([
@ -97,6 +194,58 @@ tokens = sdp.load([
pdp = PastedDefParser() pdp = PastedDefParser()
tokens2 = pdp.load(""" tokens2 = pdp.load("""
544 Second TIN
Notice
(Optional)
1 Enter 2 (two) to indicate notification by IRS twice within
three calendar years that the payee provided an incorrect name
and/or TIN combination; otherwise, enter a blank.
545-546 Blank 2 Enter blanks.
547-586 Foreign Country
or U.S.
Possession
40 Enter the name of the foreign country or U.S. possession to
which the withheld foreign tax (Amount Code 6) applies.
Otherwise, enter blanks.
587-599 CUSIP Number 13 Enter CUSIP Number. If the tax-exempt interest is reported in
the aggregate for multiple bonds or accounts, enter: VARIOUS.
Right-justify information and fill unused positions with blanks l.
600-662 Blank 63 Enter blanks.
663-722 Special Data
Entries
60 This portion of the B Record may be used to record
information for state or local government reporting or for the
filer's own purposes. Payers should contact the state or local
revenue departments for filing requirements. You may enter
your routing and transit number (RTN) here. If this field is not
utilized, enter blanks.
103-114 Payment 103-114 Payment
Amount 5* Amount 5*
@ -127,3 +276,43 @@ Amount 8*
12 The amount reported in this field represents payments f 12 The amount reported in this field represents payments f
""") """)
tokens3 = pdp.load("""
544-546 Blank 3 Enter blanks.
547 Trade or
Business
Indicator
1 Enter 1 (one) to indicate the state or local income tax refund,
credit, or offset (Amount Code 2) is attributable to income tax
that applies exclusively to income from a trade or business.
Indicator Usage
1 Income tax refund applies exclusively to a trade or
business.
Blank Income tax refund is a general tax refund.
548-551 Tax Year of
Refund
4 Enter the tax year for which the refund, credit, or
offset (Amount Code 2) was issued. The tax year must reflect
the tax year for which the refund was made, not the tax year
of Form 1099-G. The tax year must be in the four-position
format of YYYY (e.g., 2010). The valid range of years for the
refund is 2001 through 2010.
""")