Parser is mostly working, but there's an issue with the last grouping of tokens
not being parsed. This can probably be fixed by yielding an end-marker from the tokenizer generator so the compiler knows to clear out the last item.
This commit is contained in:
parent
6e9b8041b9
commit
027b44b65c
1 changed files with 196 additions and 7 deletions
|
@ -41,31 +41,53 @@ class BaseToken(object):
|
|||
regexp = re.compile('(.*)')
|
||||
|
||||
def __init__(self, value):
|
||||
self.value = value
|
||||
self._value = value
|
||||
|
||||
def match(self, value):
    """Return the compiled pattern stored on this token's ``regexp``.

    ``value`` is accepted but never consulted; no matching is performed
    here.
    """
    # NOTE(review): the name suggests ``self.regexp.match(value)`` may have
    # been intended -- confirm against the callers before changing it.
    pattern = self.regexp
    return pattern
|
||||
|
||||
def __repr__(self):
|
||||
return ",".join([str(self.__class__), self.value])
|
||||
return ",".join([str(self.__class__), self._value])
|
||||
|
||||
|
||||
class StringToken(BaseToken):
    """Token holding free text, exposed with non-ASCII characters removed."""

    regexp = re.compile('(.*)')

    @property
    def value(self):
        """Return the raw token text stripped down to its ASCII characters.

        Byte strings are decoded and text strings are filtered, so the
        result is always text. Calling ``.decode`` directly on text fails:
        Python 2 first encodes it as strict ASCII (raising on characters
        such as curly quotes) and Python 3 ``str`` has no ``decode`` at all.
        """
        raw = self._value
        if isinstance(raw, bytes):
            return raw.decode('ascii', 'ignore')
        # Already text: round-trip through ASCII to drop everything else.
        return raw.encode('ascii', 'ignore').decode('ascii')
|
||||
|
||||
class RangeToken(BaseToken):
    """Token for a position range written as ``<start>-<end>``."""

    # Raw string so ``\d`` reaches the regex engine instead of being parsed
    # as a (deprecated) string escape.
    regexp = re.compile(r'(\d+)-(\d+)')

    @property
    def value(self):
        """Return how many positions the range spans, both ends included.

        For example a raw value of ``"545-546"`` yields ``2``.
        """
        start, end = (int(part) for part in self._value.split('-'))
        return end - start + 1

    @property
    def end_position(self):
        """Return the number after the dash, i.e. the range's last position."""
        return int(self._value.split('-')[1])
|
||||
|
||||
class NumericToken(BaseToken):
    """Token for text made up of decimal digits, exposed as an ``int``."""

    # Anchored at both ends so digits embedded in longer text do not match.
    # Raw string so ``\d`` is passed to the regex engine untouched.
    regexp = re.compile(r'^(\d+)$')

    @property
    def value(self):
        """Return the raw token text converted to an integer."""
        return int(self._value)
|
||||
|
||||
|
||||
|
||||
class PastedDefParser(object):
|
||||
TOKEN_TYPES = [
|
||||
RangeToken,
|
||||
NumericToken,
|
||||
BaseToken,
|
||||
StringToken,
|
||||
]
|
||||
|
||||
def load(self, infile):
    """Tokenize ``infile`` and return what ``_parse`` makes of the tokens.

    ``infile`` is handed straight to ``_tokenize``; its output is fed to
    ``_parse`` and that result is returned unchanged.
    """
    # The _compile() pass is currently disabled, so callers receive the
    # output of _parse() as-is.
    return self._parse(self._tokenize(infile))
|
||||
|
||||
def _tokenize(self, data):
|
||||
|
@ -82,10 +104,85 @@ class PastedDefParser(object):
|
|||
# TODO group things based on strides between RangeTokens, probably
|
||||
# starting with range token, then the following BaseTokens are likely
|
||||
# the field name, followed by a NumericToken, then Base/Numeric tokens
|
||||
# for the field's description, until then ext RangeToken is found.
|
||||
results = tokens
|
||||
# for the field's description, until the next RangeToken is found.
|
||||
groups = []
|
||||
current_range = None
|
||||
current_name = []
|
||||
current_length = None
|
||||
current_desc = []
|
||||
state = 'range'
|
||||
|
||||
byte_pos = None
|
||||
|
||||
# COLLECT TOKENS INTO GROUPS
|
||||
for token in tokens:
|
||||
"""
|
||||
if byte_pos == None:
|
||||
# START. IF byte_pos IS NONE, WE ASSUME WE'RE STARTING WITH
|
||||
# A NEW SERIES OF FIELDS. THE FIRST VALUE WHICH IS A RANGE
|
||||
# OR A NUMERIC FIELD IS ASSUMED TO BE THE FIELD POSITION/RANGE.
|
||||
|
||||
if isinstance(token, RangeToken):
|
||||
byte_pos = token.end_position + 1
|
||||
elif isinstance(token, NumericToken):
|
||||
byte_pos = token.value + 1
|
||||
"""
|
||||
if isinstance(token, NumericToken):
|
||||
# THIS MAY BE A RANGE, IF THE VALUE MATCHES THE CURRENT byte_pos
|
||||
# THEN WE ASSUME THIS TO BE A SINGLE BYTE RANGE DEFINITION,
|
||||
# AND WE WILL TREAT IT AS A NEW RECORD.
|
||||
if byte_pos == None or token.value == byte_pos:
|
||||
# UPDATE RANGE POSITION
|
||||
byte_pos = token.value + 1
|
||||
|
||||
# CONVERT TOKEN INTO RangeToken
|
||||
token = RangeToken("%d-%d" % (token.value, token.value))
|
||||
|
||||
|
||||
if isinstance(token, RangeToken):
|
||||
if current_range:
|
||||
groups.append({
|
||||
'byterange': current_range,
|
||||
'name': current_name,
|
||||
'length': current_length,
|
||||
'desc': current_desc,
|
||||
})
|
||||
|
||||
# UPDATE RANGE POSITION
|
||||
byte_pos = token.end_position + 1
|
||||
|
||||
current_range = token
|
||||
current_name = []
|
||||
current_length = None
|
||||
current_desc = []
|
||||
state = 'name'
|
||||
|
||||
|
||||
elif state == 'name':
|
||||
if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken):
|
||||
current_length = current_name.pop()
|
||||
state = 'desc'
|
||||
else:
|
||||
current_name.append(token)
|
||||
if state == 'desc':
|
||||
current_desc.append(token)
|
||||
|
||||
results = groups
|
||||
return results
|
||||
|
||||
def _compile(self, groups):
|
||||
for g in groups:
|
||||
assert g['byterange'].value == g['length'].value
|
||||
|
||||
name = u'_'.join(map(lambda x:x.value.lower(), g['name']))
|
||||
name = re.sub(r'[^\w]','', name)
|
||||
|
||||
yield({
|
||||
'name': name,
|
||||
'desc': u' '.join(map(lambda x:unicode(x.value), g['desc'])),
|
||||
'length': g['byterange'].value,
|
||||
})
|
||||
|
||||
|
||||
sdp = SimpleDefParser()
|
||||
tokens = sdp.load([
|
||||
|
@ -97,6 +194,58 @@ tokens = sdp.load([
|
|||
|
||||
pdp = PastedDefParser()
|
||||
tokens2 = pdp.load("""
|
||||
544 Second TIN
|
||||
|
||||
Notice
|
||||
|
||||
(Optional)
|
||||
|
||||
1 Enter “2” (two) to indicate notification by IRS twice within
|
||||
|
||||
three calendar years that the payee provided an incorrect name
|
||||
|
||||
and/or TIN combination; otherwise, enter a blank.
|
||||
|
||||
545-546 Blank 2 Enter blanks.
|
||||
|
||||
547-586 Foreign Country
|
||||
|
||||
or U.S.
|
||||
|
||||
Possession
|
||||
|
||||
40 Enter the name of the foreign country or U.S. possession to
|
||||
|
||||
which the withheld foreign tax (Amount Code 6) applies.
|
||||
|
||||
Otherwise, enter blanks.
|
||||
|
||||
587-599 CUSIP Number 13 Enter CUSIP Number. If the tax-exempt interest is reported in
|
||||
|
||||
the aggregate for multiple bonds or accounts, enter: VARIOUS.
|
||||
|
||||
Right-justify information and fill unused positions with blanks l.
|
||||
|
||||
600-662 Blank 63 Enter blanks.
|
||||
|
||||
663-722 Special Data
|
||||
|
||||
Entries
|
||||
|
||||
60 This portion of the “B” Record may be used to record
|
||||
|
||||
information for state or local government reporting or for the
|
||||
|
||||
filer's own purposes. Payers should contact the state or local
|
||||
|
||||
revenue departments for filing requirements. You may enter
|
||||
|
||||
your routing and transit number (RTN) here. If this field is not
|
||||
|
||||
utilized, enter blanks.
|
||||
|
||||
|
||||
|
||||
103-114 Payment
|
||||
|
||||
Amount 5*
|
||||
|
@ -127,3 +276,43 @@ Amount 8*
|
|||
|
||||
12 The amount reported in this field represents payments f
|
||||
""")
|
||||
|
||||
tokens3 = pdp.load("""
|
||||
544-546 Blank 3 Enter blanks.
|
||||
|
||||
547 Trade or
|
||||
|
||||
Business
|
||||
|
||||
Indicator
|
||||
|
||||
1 Enter “1” (one) to indicate the state or local income tax refund,
|
||||
|
||||
credit, or offset (Amount Code 2) is attributable to income tax
|
||||
|
||||
that applies exclusively to income from a trade or business.
|
||||
|
||||
Indicator Usage
|
||||
|
||||
1 Income tax refund applies exclusively to a trade or
|
||||
|
||||
business.
|
||||
|
||||
Blank Income tax refund is a general tax refund.
|
||||
|
||||
548-551 Tax Year of
|
||||
|
||||
Refund
|
||||
|
||||
4 Enter the tax year for which the refund, credit, or
|
||||
|
||||
offset (Amount Code 2) was issued. The tax year must reflect
|
||||
|
||||
the tax year for which the refund was made, not the tax year
|
||||
|
||||
of Form 1099-G. The tax year must be in the four-position
|
||||
|
||||
format of YYYY (e.g., 2010). The valid range of years for the
|
||||
|
||||
refund is 2001 through 2010.
|
||||
""")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue