Parser is mostly working, but there's an issue with the last grouping of tokens
not being parsed. This can probably be fixed by yielding an end-marker from the tokenizer generator so the compiler knows to clear out the last item.
This commit is contained in:
parent
6e9b8041b9
commit
027b44b65c
1 changed files with 196 additions and 7 deletions
|
@ -41,31 +41,53 @@ class BaseToken(object):
|
||||||
regexp = re.compile('(.*)')
|
regexp = re.compile('(.*)')
|
||||||
|
|
||||||
def __init__(self, value):
|
def __init__(self, value):
|
||||||
self.value = value
|
self._value = value
|
||||||
|
|
||||||
def match(self, value):
|
def match(self, value):
|
||||||
return self.regexp
|
return self.regexp
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ",".join([str(self.__class__), self.value])
|
return ",".join([str(self.__class__), self._value])
|
||||||
|
|
||||||
|
|
||||||
|
class StringToken(BaseToken):
    """Free-text token whose value is the raw text reduced to ASCII."""

    # The pattern accepts any text.
    regexp = re.compile('(.*)')

    @property
    def value(self):
        """Return the raw token text with all non-ASCII characters dropped.

        Works for both byte strings and text strings, so it never raises on
        characters such as the curly quotes found in pasted specification
        text.
        """
        raw = self._value
        if isinstance(raw, bytes):
            # Byte strings (``str`` on Python 2) only need decoding.
            return raw.decode('ascii', 'ignore')
        # Text strings cannot be decoded directly: on Python 2 ``.decode``
        # first encodes implicitly and raises UnicodeEncodeError on non-ASCII
        # input, and on Python 3 the method does not exist.  Round-trip
        # through an ASCII encode to strip the offending characters instead.
        return raw.encode('ascii', 'ignore').decode('ascii')
|
||||||
|
|
||||||
class RangeToken(BaseToken):
    """Token for an inclusive position range written as ``start-end``."""

    # Raw string so ``\d`` reaches the regex engine without relying on
    # Python's handling of unknown string escapes.
    regexp = re.compile(r'(\d+)-(\d+)')

    @property
    def _bounds(self):
        """Return ``(start, end)`` as integers parsed from the raw text."""
        parts = self._value.split('-')
        # Only the first two parts are used, so ``value`` and
        # ``end_position`` always agree with each other.
        return int(parts[0]), int(parts[1])

    @property
    def value(self):
        """Return the number of positions covered, counting both ends."""
        start, end = self._bounds
        return end - start + 1

    @property
    def end_position(self):
        """Return the last position of the range as an integer."""
        return self._bounds[1]
|
||||||
|
|
||||||
class NumericToken(BaseToken):
    """Token for text that consists solely of decimal digits."""

    # Anchored at both ends so text that merely contains digits is not
    # treated as numeric.  Raw string so ``\d`` is passed to the regex
    # engine unchanged.
    regexp = re.compile(r'^(\d+)$')

    @property
    def value(self):
        """Return the token text converted to an integer."""
        return int(self._value)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class PastedDefParser(object):
|
class PastedDefParser(object):
|
||||||
TOKEN_TYPES = [
|
TOKEN_TYPES = [
|
||||||
RangeToken,
|
RangeToken,
|
||||||
NumericToken,
|
NumericToken,
|
||||||
BaseToken,
|
StringToken,
|
||||||
]
|
]
|
||||||
|
|
||||||
def load(self, infile):
    """Tokenize ``infile`` and return the result of parsing those tokens.

    The tokens produced by ``_tokenize`` are handed straight to ``_parse``
    and whatever ``_parse`` returns is returned unchanged.
    """
    grouped = self._parse(self._tokenize(infile))
    # NOTE: the follow-up ``_compile`` pass is currently disabled, so the
    # caller receives the parsed groups rather than compiled definitions.
    return grouped
|
||||||
|
|
||||||
def _tokenize(self, data):
|
def _tokenize(self, data):
|
||||||
|
@ -82,10 +104,85 @@ class PastedDefParser(object):
|
||||||
# TODO group things based on strides between RangeTokens, probably
|
# TODO group things based on strides between RangeTokens, probably
|
||||||
# starting with range token, then the following BaseTokens are likely
|
# starting with range token, then the following BaseTokens are likely
|
||||||
# the field name, followed by a NumericToken, then Base/Numeric tokens
|
# the field name, followed by a NumericToken, then Base/Numeric tokens
|
||||||
# for the field's description, until then ext RangeToken is found.
|
# for the field's description, until the next RangeToken is found.
|
||||||
results = tokens
|
groups = []
|
||||||
|
current_range = None
|
||||||
|
current_name = []
|
||||||
|
current_length = None
|
||||||
|
current_desc = []
|
||||||
|
state = 'range'
|
||||||
|
|
||||||
|
byte_pos = None
|
||||||
|
|
||||||
|
# COLLECT TOKENS INTO GROUPS
|
||||||
|
for token in tokens:
|
||||||
|
"""
|
||||||
|
if byte_pos == None:
|
||||||
|
# START. IF byte_pos IS NONE, WE ASSUME WE'RE STARTING WITH
|
||||||
|
# A NEW SERIES OF FIELDS. THE FIRST VALUE WHICH IS A RANGE
|
||||||
|
# OR A NUMERIC FIELD IS ASSUMED TO BE THE FIELD POSITION/RANGE.
|
||||||
|
|
||||||
|
if isinstance(token, RangeToken):
|
||||||
|
byte_pos = token.end_position + 1
|
||||||
|
elif isinstance(token, NumericToken):
|
||||||
|
byte_pos = token.value + 1
|
||||||
|
"""
|
||||||
|
if isinstance(token, NumericToken):
|
||||||
|
# THIS MAY BE A RANGE, IF THE VALUE MATCHES THE CURRENT byte_pos
|
||||||
|
# THEN WE ASSUME THIS TO BE A SINGLE BYTE RANGE DEFINITION,
|
||||||
|
# AND WE WILL TREAT IT AS A NEW RECORD.
|
||||||
|
if byte_pos == None or token.value == byte_pos:
|
||||||
|
# UPDATE RANGE POSITION
|
||||||
|
byte_pos = token.value + 1
|
||||||
|
|
||||||
|
# CONVERT TOKEN INTO RangeToken
|
||||||
|
token = RangeToken("%d-%d" % (token.value, token.value))
|
||||||
|
|
||||||
|
|
||||||
|
if isinstance(token, RangeToken):
|
||||||
|
if current_range:
|
||||||
|
groups.append({
|
||||||
|
'byterange': current_range,
|
||||||
|
'name': current_name,
|
||||||
|
'length': current_length,
|
||||||
|
'desc': current_desc,
|
||||||
|
})
|
||||||
|
|
||||||
|
# UPDATE RANGE POSITION
|
||||||
|
byte_pos = token.end_position + 1
|
||||||
|
|
||||||
|
current_range = token
|
||||||
|
current_name = []
|
||||||
|
current_length = None
|
||||||
|
current_desc = []
|
||||||
|
state = 'name'
|
||||||
|
|
||||||
|
|
||||||
|
elif state == 'name':
|
||||||
|
if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken):
|
||||||
|
current_length = current_name.pop()
|
||||||
|
state = 'desc'
|
||||||
|
else:
|
||||||
|
current_name.append(token)
|
||||||
|
if state == 'desc':
|
||||||
|
current_desc.append(token)
|
||||||
|
|
||||||
|
results = groups
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
def _compile(self, groups):
|
||||||
|
for g in groups:
|
||||||
|
assert g['byterange'].value == g['length'].value
|
||||||
|
|
||||||
|
name = u'_'.join(map(lambda x:x.value.lower(), g['name']))
|
||||||
|
name = re.sub(r'[^\w]','', name)
|
||||||
|
|
||||||
|
yield({
|
||||||
|
'name': name,
|
||||||
|
'desc': u' '.join(map(lambda x:unicode(x.value), g['desc'])),
|
||||||
|
'length': g['byterange'].value,
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
sdp = SimpleDefParser()
|
sdp = SimpleDefParser()
|
||||||
tokens = sdp.load([
|
tokens = sdp.load([
|
||||||
|
@ -97,6 +194,58 @@ tokens = sdp.load([
|
||||||
|
|
||||||
pdp = PastedDefParser()
|
pdp = PastedDefParser()
|
||||||
tokens2 = pdp.load("""
|
tokens2 = pdp.load("""
|
||||||
|
544 Second TIN
|
||||||
|
|
||||||
|
Notice
|
||||||
|
|
||||||
|
(Optional)
|
||||||
|
|
||||||
|
1 Enter “2” (two) to indicate notification by IRS twice within
|
||||||
|
|
||||||
|
three calendar years that the payee provided an incorrect name
|
||||||
|
|
||||||
|
and/or TIN combination; otherwise, enter a blank.
|
||||||
|
|
||||||
|
545-546 Blank 2 Enter blanks.
|
||||||
|
|
||||||
|
547-586 Foreign Country
|
||||||
|
|
||||||
|
or U.S.
|
||||||
|
|
||||||
|
Possession
|
||||||
|
|
||||||
|
40 Enter the name of the foreign country or U.S. possession to
|
||||||
|
|
||||||
|
which the withheld foreign tax (Amount Code 6) applies.
|
||||||
|
|
||||||
|
Otherwise, enter blanks.
|
||||||
|
|
||||||
|
587-599 CUSIP Number 13 Enter CUSIP Number. If the tax-exempt interest is reported in
|
||||||
|
|
||||||
|
the aggregate for multiple bonds or accounts, enter: VARIOUS.
|
||||||
|
|
||||||
|
Right-justify information and fill unused positions with blanks l.
|
||||||
|
|
||||||
|
600-662 Blank 63 Enter blanks.
|
||||||
|
|
||||||
|
663-722 Special Data
|
||||||
|
|
||||||
|
Entries
|
||||||
|
|
||||||
|
60 This portion of the “B” Record may be used to record
|
||||||
|
|
||||||
|
information for state or local government reporting or for the
|
||||||
|
|
||||||
|
filer's own purposes. Payers should contact the state or local
|
||||||
|
|
||||||
|
revenue departments for filing requirements. You may enter
|
||||||
|
|
||||||
|
your routing and transit number (RTN) here. If this field is not
|
||||||
|
|
||||||
|
utilized, enter blanks.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
103-114 Payment
|
103-114 Payment
|
||||||
|
|
||||||
Amount 5*
|
Amount 5*
|
||||||
|
@ -127,3 +276,43 @@ Amount 8*
|
||||||
|
|
||||||
12 The amount reported in this field represents payments f
|
12 The amount reported in this field represents payments f
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
tokens3 = pdp.load("""
|
||||||
|
544-546 Blank 3 Enter blanks.
|
||||||
|
|
||||||
|
547 Trade or
|
||||||
|
|
||||||
|
Business
|
||||||
|
|
||||||
|
Indicator
|
||||||
|
|
||||||
|
1 Enter “1” (one) to indicate the state or local income tax refund,
|
||||||
|
|
||||||
|
credit, or offset (Amount Code 2) is attributable to income tax
|
||||||
|
|
||||||
|
that applies exclusively to income from a trade or business.
|
||||||
|
|
||||||
|
Indicator Usage
|
||||||
|
|
||||||
|
1 Income tax refund applies exclusively to a trade or
|
||||||
|
|
||||||
|
business.
|
||||||
|
|
||||||
|
Blank Income tax refund is a general tax refund.
|
||||||
|
|
||||||
|
548-551 Tax Year of
|
||||||
|
|
||||||
|
Refund
|
||||||
|
|
||||||
|
4 Enter the tax year for which the refund, credit, or
|
||||||
|
|
||||||
|
offset (Amount Code 2) was issued. The tax year must reflect
|
||||||
|
|
||||||
|
the tax year for which the refund was made, not the tax year
|
||||||
|
|
||||||
|
of Form 1099-G. The tax year must be in the four-position
|
||||||
|
|
||||||
|
format of YYYY (e.g., 2010). The valid range of years for the
|
||||||
|
|
||||||
|
refund is 2001 through 2010.
|
||||||
|
""")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue