pyaccuwage/pyaccuwage/parser.py

#!/usr/bin/python
# coding=UTF-8
"""
Parser utilities that read field definitions from Publication 1220 and
convert them into Python field declarations.

"""
import re


class SimpleDefParser(object):
    """Tokenize comma-separated definition rows.

    Every row is split on commas and each item is normalised:

    * items containing spaces have the spaces replaced by underscores,
      all other items are upper-cased;
    * an item made of integers joined by ``-`` (``"1-5"``) is collapsed to
      the difference between its parts (``4``);
    * any other item is converted to ``int`` when possible and otherwise
      kept as text.
    """

    def __init__(self):
        pass

    def load(self, infile):
        """Yield one tuple of tokens for every row of ``infile``.

        ``infile`` is any iterable of strings (an open text file, a list
        of lines, ...).
        """
        for row in infile:
            yield tuple(self._tokenize(row))

    def _intify(self, x):
        """Return ``x`` as an ``int`` if it parses as one, else ``x`` unchanged."""
        try:
            return int(x.strip())
        except ValueError:
            return x

    def _tokenize(self, row):
        """Yield the normalised items of a single comma-separated ``row``."""
        for item in row.split(','):
            item = item.strip()
            if ' ' in item:
                item = item.replace(' ', '_')
            else:
                item = item.upper()

            if '-' in item:
                parts = [self._intify(part) for part in item.split('-')]
                try:
                    # Left fold of "next - accumulated", i.e. end - start
                    # for the usual two-part "start-end" form.
                    span = parts[0]
                    for part in parts[1:]:
                        span = part - span
                    item = span
                except TypeError:
                    # At least one part is not a number (a hyphenated word,
                    # or a leading minus sign): treat the item like any
                    # other value instead of failing on the subtraction.
                    item = self._intify(item)
            else:
                item = self._intify(item)
            yield item


class LengthExpression(object):
    """Evaluate simple length constraints such as ``'=2'`` or ``'>=10'``.

    An expression is a comparison operator (one of the keys of ``OPS``)
    followed by an integer.  Instances are callable: ``expr(value, exps)``
    is true when ``value`` satisfies every expression in ``exps``.
    Compiled expressions are cached per instance in ``exp_cache``.
    """
    import operator
    REG = re.compile(r'([^\d]*)(\d+)')
    OPS = {
        '<': operator.lt,
        '>': operator.gt,
        '<=': operator.le,
        '>=': operator.ge,
        '=': operator.eq,
    }

    def __init__(self):
        # Maps expression text -> (operator function, integer operand).
        self.exp_cache = {}

    def __call__(self, value, exps):
        """Return True when ``value`` satisfies all of ``exps``.

        An empty ``exps`` is trivially satisfied.
        """
        return all(self.check(value, exp) for exp in exps)

    def compile_exp(self, exp):
        """Split ``exp`` into ``(operator function, integer operand)``.

        Whitespace around the operator is ignored.

        Raises:
            ValueError: if ``exp`` has no integer operand or uses an
                operator that is not listed in ``OPS``.
        """
        found = self.REG.match(exp)
        if found is None:
            raise ValueError("invalid length expression: %r" % (exp,))

        op_text, operand = found.groups()
        op = self.OPS.get(op_text.strip())
        if op is None:
            # Fail here with a clear message rather than later with
            # "'NoneType' object is not callable" inside check().
            raise ValueError(
                "unknown operator %r in length expression %r" % (op_text, exp))
        return (op, int(operand))

    def check(self, value, exp):
        """Return the result of applying the single expression ``exp`` to ``value``."""
        try:
            op, opval = self.exp_cache[exp]
        except KeyError:
            op, opval = self.exp_cache[exp] = self.compile_exp(exp)
        return op(value, opval)


class BaseToken(object):
    """Base class for tokens; wraps the raw value a token was built from.

    Subclasses override ``regexp`` with the pattern that recognises their
    kind of token.
    """
    regexp = re.compile('(.*)')

    def __init__(self, value):
        self._value = value

    def match(self, value):
        """Return the result of matching ``value`` against ``regexp``.

        The result is a match object, or ``None`` when ``value`` does not
        match.
        """
        return self.regexp.match(value)

    def __repr__(self):
        # str() keeps the join from failing when the raw value is not text.
        return ",".join([str(self.__class__), str(self._value)])


class StringToken(BaseToken):
    """Catch-all token: its pattern matches any text."""
    regexp = re.compile('(.*)')

    @property
    def value(self):
        """The raw value as text with every non-ASCII character dropped."""
        raw = self._value
        if not isinstance(raw, bytes):
            # Text input: encode explicitly so non-ASCII characters are
            # dropped instead of raising during an implicit conversion.
            raw = raw.encode('ascii', 'ignore')
        return raw.decode('ascii', 'ignore')

class RangeToken(BaseToken):
    """Token for a position range written as ``start-end`` (e.g. ``1-5``)."""
    regexp = re.compile(r'(\d+)-(\d+)')

    @property
    def value(self):
        """Number of positions covered by the range.

        A value without a dash counts as a single position.  For
        ``start-end`` this is ``end - start + 1``.

        Raises:
            ValueError: if a part of the raw value is not an integer.
        """
        if '-' not in self._value:
            return 1
        bounds = [int(part) for part in self._value.split('-')]
        # Left fold of "next - accumulated"; for two parts: end - start.
        span = bounds[0]
        for bound in bounds[1:]:
            span = bound - span
        return span + 1

    @property
    def end_position(self):
        """Last position of the range.

        For a value without a dash this is the single position itself.
        """
        parts = self._value.split('-')
        return int(parts[1] if len(parts) > 1 else parts[0])

    @property
    def start_position(self):
        """First position of the range."""
        return int(self._value.split('-')[0])


class NumericToken(BaseToken):
    """Token consisting only of decimal digits."""
    # Raw string so that "\d" reaches the regex engine without being
    # treated as a (deprecated) string escape sequence.
    regexp = re.compile(r'^(\d+)$')

    @property
    def value(self):
        """The raw value converted to ``int``."""
        return int(self._value)


class RecordBuilder(object):
    """Turn tabular field definitions into field declaration source lines.

    ``load`` accepts entries of the form
    ``(range, name, length, description)`` and produces one text line per
    valid entry, e.g. ``some_name = TextField(max_length=5)`` followed by
    a ``# range`` comment.
    """
    import fields

    # Number of columns in a complete entry: range, name, length, description.
    entry_max_length = 4

    # Token classes in the order they are tried; StringToken matches
    # anything, so it comes last.
    TOKEN_TYPES = [
        RangeToken,
        NumericToken,
        StringToken,
    ]

    # (field class, criteria) pairs used to guess the type of an entry.
    #   'regexp': maps an entry key to patterns; every pattern found in
    #             that entry value adds one point to the field class.
    #   'length': LengthExpression strings that must all hold for the
    #             entry length, otherwise the field class scores nothing.
    # NOTE(review): the patterns are applied case-sensitively and the
    # description is not lower-cased before matching -- confirm that the
    # descriptions are expected to be lower case.
    FIELD_TYPES = [
        (fields.BlankField, {
            'regexp': {
                'name': [
                    re.compile(r'^blank$'),
                ],
            },
        }),

        (fields.MoneyField, {
            'regexp': {
                'desc': [
                    re.compile(r'right\-justified'),
                    re.compile(r'amount'),
                    re.compile(r'zero\-filled'),
                ],
            },
        }),

        (fields.TextField, {
            'regexp': {
                'desc': [
                    re.compile(r'enter blanks'),
                ],
            },
        }),

        (fields.StateField, {
            'regexp': {
                'desc': [
                    re.compile(r'state'),
                    re.compile(r'postal'),
                ],
            },
            'length': ['=2'],
        })
    ]

    def load(self, entries):
        """Return a generator of declaration lines for ``entries``.

        Entries are expected in the form
        ``[('1-5', 'Field Name', '5', 'Description of field.'), ...]``.
        """
        entries = self._compile(entries)
        entries = self._guess_field_types(entries)
        entries = self._convert_to_records(entries)
        return entries

    def _compile(self, entries):
        """Validate raw entries and yield them as normalised dicts.

        Entries are skipped when they have more than ``entry_max_length``
        columns, when the length or range is not numeric, or when the
        declared length disagrees with the size of the range.  Shorter
        entries are padded with empty strings.

        Each yielded dict has the keys ``name``, ``range``, ``desc``,
        ``length`` and ``required`` (``True``, ``False`` or ``None`` when
        unknown).
        """
        for entry in entries:
            if len(entry) > self.entry_max_length:
                continue

            padding = [''] * (self.entry_max_length - len(entry))
            (f_range, f_name, f_length, f_desc) = list(entry) + padding

            try:
                f_length = int(f_length)
                range_length = RangeToken(f_range).value
            except ValueError:
                # bad result, skip
                continue

            # Explicit comparison rather than ``assert`` so the check is
            # still performed when Python runs with -O.
            if f_length != range_length:
                continue

            name_parts = f_name.split(' ')

            if name_parts[-1].lower() == '(optional)':
                name_parts = name_parts[:-1]
                required = False
            elif re.search('required', f_desc, re.IGNORECASE):
                required = True
            else:
                required = None

            f_name = u'_'.join(part.lower() for part in name_parts)
            f_name = f_name.replace('&', 'and')
            f_name = re.sub(r'[^\w]', '', f_name)

            yield {
                'name': f_name,
                'range': f_range,
                'desc': f_desc,
                'length': f_length,
                'required': required,
            }

    def _guess_field_types(self, entries):
        """Add a ``guessed_type`` key to every entry and yield it.

        Each class in ``FIELD_TYPES`` scores one point per matching
        pattern; the highest score wins.  On a tie the class declared
        later in ``FIELD_TYPES`` wins, so the result does not depend on
        dict iteration order.  ``fields.TextField`` is used when nothing
        matches.
        """
        lengthexp = LengthExpression()

        for entry in entries:
            scores = dict((classtype, 0) for classtype, _ in self.FIELD_TYPES)

            for (classtype, criteria) in self.FIELD_TYPES:
                if 'length' in criteria and not lengthexp(
                        int(entry['length']), criteria['length']):
                    continue

                for crit_key, crit_values in criteria.get('regexp', {}).items():
                    for crit_re in crit_values:
                        if crit_re.search(entry[crit_key]):
                            scores[classtype] += 1

            guessed_type, best_score = self.fields.TextField, 0
            for classtype, _ in self.FIELD_TYPES:
                score = scores[classtype]
                if score and score >= best_score:
                    guessed_type, best_score = classtype, score

            entry['guessed_type'] = guessed_type
            yield entry

    def _convert_to_records(self, entries):
        """Yield one declaration line per entry.

        Entries named ``blank`` are numbered (``blank1``, ``blank2``, ...)
        so that the generated names stay unique.  The line is padded to 85
        columns and followed by the entry's range as a comment.
        """
        blank_count = 1
        for entry in entries:
            result = []
            add = result.append

            # FIELD NAME
            if entry['name'] == 'blank':
                add((u'blank%d' % blank_count).ljust(40))
                blank_count += 1
            else:
                add(entry['name'].ljust(40))

            add(' = ')

            if entry['guessed_type']:
                add(entry['guessed_type'].__name__)

                args = []
                args.append("max_length=%d" % entry['length'])
                if entry['required'] is not None:
                    args.append("required=%s" % ('True' if entry['required'] else 'False'))

                add("(" + ", ".join(args) + ")")

            yield "".join(result).ljust(85) + "# %s" % entry['range']


class PastedDefParser(RecordBuilder):
    """Build field declarations from free text.

    The text is split into tokens, the tokens are grouped into fields
    (range, name, length, description) by following the running byte
    position, and the groups are then passed through the type guessing
    and line formatting inherited from ``RecordBuilder``.
    """

    def load(self, infile):
        """Return a generator of declaration lines for the text ``infile``.

        ``infile`` is the complete text as a single string.
        """
        tokens = self._tokenize(infile)
        entries = self._parse(tokens)
        entries = self._compile(entries)
        entries = self._guess_field_types(entries)
        entries = self._convert_to_records(entries)
        return entries

    def _tokenize(self, data):
        """Yield a token object for every space/newline separated word.

        The first class of ``TOKEN_TYPES`` whose pattern matches the word
        is used.  A final ``None`` is yielded as an end-of-input marker so
        that ``_parse`` flushes the last field.
        """
        for item in data.replace('\n', ' ').split(' '):
            item = item.strip()
            if len(item) == 0:
                continue
            for tclass in self.TOKEN_TYPES:
                if tclass.regexp.match(item):
                    yield tclass(item)
                    break

        yield None

    def _parse(self, tokens):
        """Group ``tokens`` into fields and return them as a list of dicts.

        Each dict has the keys ``byterange`` (RangeToken), ``name`` (list
        of tokens), ``length`` (NumericToken or ``None``) and ``desc``
        (list of tokens).
        """
        # TODO group things based on strides between RangeTokens, probably
        # starting with range token, then the following BaseTokens are likely
        # the field name, followed by a NumericToken, then Base/Numeric tokens
        # for the field's description, until the next RangeToken is found.
        groups = []
        current_range = None
        current_name = []
        current_length = None
        current_desc = []
        state = 'range'

        # Position at which the next field is expected to start.
        byte_pos = None

        # COLLECT TOKENS INTO GROUPS
        for token in tokens:
            if isinstance(token, NumericToken):
                # THIS MAY BE A RANGE, IF THE VALUE MATCHES THE CURRENT byte_pos
                # THEN WE ASSUME THIS TO BE A SINGLE BYTE RANGE DEFINITION,
                # AND WE WILL TREAT IT AS A NEW RECORD.
                if byte_pos is None or token.value == byte_pos:
                    # CONVERT TOKEN INTO RangeToken.  byte_pos is left
                    # untouched here: it is advanced below once the range
                    # has been accepted.  Advancing it first would make the
                    # start-position check below fail for every
                    # single-position field.
                    token = RangeToken("%d-%d" % (token.value, token.value))

            if isinstance(token, RangeToken) or token is None:
                # IF THIS RANGETOKEN IS NOT THE BEGINNING OF A NEW
                # FIELD, THEN ITS RANGES WILL NOT MATCH THE PREVIOUS
                # byte_pos AND IS ASSUMED TO BE PART OF THE DESCRIPTION.
                if token and byte_pos and token.start_position != byte_pos:
                    state = 'desc'

                else:
                    if current_range:
                        groups.append({
                            'byterange': current_range,
                            'name': current_name,
                            'length': current_length,
                            'desc': current_desc,
                        })

                    # UPDATE RANGE POSITION
                    if token:
                        byte_pos = token.end_position + 1

                    current_range = token
                    current_name = []
                    current_length = None
                    current_desc = []
                    state = 'name'

            elif state == 'name':
                # The name ends at the first text token that follows a
                # number: that number is the field length and the text
                # token starts the description.
                if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken):
                    current_length = current_name.pop()
                    state = 'desc'
                else:
                    current_name.append(token)

            # Deliberately not an ``elif``: the token that switched the
            # state to 'desc' above is itself part of the description.
            if state == 'desc':
                current_desc.append(token)

        return groups

    def _compile(self, groups):
        """Yield a normalised entry dict for every parsed group.

        The dicts carry the keys ``name``, ``range``, ``desc``, ``length``
        and ``required`` that the inherited ``_guess_field_types`` and
        ``_convert_to_records`` read.

        Raises:
            AssertionError: if a group has no length or its length differs
                from the size of its byte range.
        """
        for g in groups:
            byterange = g['byterange']
            length = g['length']

            # Raised explicitly (instead of using ``assert``) so the check
            # also runs under -O.
            if length is None or byterange.value != length.value:
                raise AssertionError(
                    "field length does not match byte range %d-%d"
                    % (byterange.start_position, byterange.end_position))

            # u'%s' formatting turns numeric token values into text, too.
            desc = u' '.join(u'%s' % (tok.value,) for tok in g['desc'])
            name_parts = [(u'%s' % (tok.value,)).lower() for tok in g['name']]

            if name_parts and name_parts[-1] == u'(optional)':
                name_parts = name_parts[:-1]
                required = False
            elif re.search('required', desc, re.IGNORECASE):
                required = True
            else:
                required = None

            name = u'_'.join(name_parts)
            name = re.sub(r'[^\w]', '', name)

            yield {
                'name': name,
                # Needed by the inherited _convert_to_records().
                'range': "%d-%d" % (byterange.start_position,
                                    byterange.end_position),
                'desc': desc,
                'length': byterange.value,
                'required': required,
            }