From b40e736ae08bcb96d1a934a98cde1a0a3033b55a Mon Sep 17 00:00:00 2001 From: Binh Nguyen Date: Tue, 19 Feb 2013 15:55:05 -0600 Subject: [PATCH] bumping version, improving field type guessing --- pyaccuwage/fields.py | 34 +++++++++++++++++----------------- pyaccuwage/parser.py | 36 ++++++++++++++++++++++++++---------- scripts/pyaccuwage-pdfparse | 6 +++++- setup.py | 2 +- 4 files changed, 49 insertions(+), 29 deletions(-) diff --git a/pyaccuwage/fields.py b/pyaccuwage/fields.py index 9e38448..960db3e 100644 --- a/pyaccuwage/fields.py +++ b/pyaccuwage/fields.py @@ -12,7 +12,7 @@ class ValidationError(Exception): return "(%s.%s) %s" % (self.field.parent_name, self.field.name, self.msg) else: return repr(self.msg) - + class Field(object): creation_counter = 0 @@ -31,13 +31,13 @@ class Field(object): def get_data(self): raise NotImplemented - + def __setvalue(self, value): self._value = value - def __getvalue(self): + def __getvalue(self): return self._value - + value = property(__getvalue, __setvalue) def read(self, fp): @@ -50,7 +50,7 @@ class Field(object): self.value = s.strip() -class TextField(Field): +class TextField(Field): def validate(self): if self.value == None and self.required: raise ValidationError("value required", field=self) @@ -65,7 +65,7 @@ class TextField(Field): class StateField(TextField): - def __init__(self, name=None, required=True, use_numeric=False): + def __init__(self, name=None, required=True, use_numeric=False, max_length=2): super(StateField, self).__init__(name=name, max_length=2, required=required) self.use_numeric = use_numeric @@ -90,7 +90,7 @@ class StateField(TextField): class EmailField(TextField): def __init__(self, name=None, required=True, max_length=None): - return super(EmailField, self).__init__(name=name, max_length=max_length, + return super(EmailField, self).__init__(name=name, max_length=max_length, required=required, uppercase=False) class IntegerField(TextField): @@ -101,7 +101,7 @@ class IntegerField(TextField): int(self.value) except ValueError: raise ValidationError("field contains non-numeric characters", field=self) - + def get_data(self): value = self.value or "" @@ -123,7 +123,7 @@ class StaticField(TextField): class BlankField(TextField): def __init__(self, name=None, max_length=0, required=False): super(TextField, self).__init__(name=name, max_length=max_length, required=required, uppercase=False) - + def get_data(self): return " " * self.max_length @@ -161,7 +161,7 @@ class MoneyField(Field): class DateField(TextField): def __init__(self, name=None, required=True, value=None): - super(TextField, self).__init__(name=name, required=required, max_length=8) + super(TextField, self).__init__(name=name, required=required, max_length=8) if value: self.value = value @@ -169,7 +169,7 @@ class DateField(TextField): if self._value: return self._value.strftime('%m%d%Y') return '0' * self.max_length - + def parse(self, s): if int(s) > 0: self.value = datetime.date(*[int(x) for x in s[4:8], s[0:2], s[2:4]]) @@ -184,9 +184,9 @@ class DateField(TextField): else: self._value = None - def __getvalue(self): + def __getvalue(self): return self._value - + value = property(__getvalue, __setvalue) @@ -196,7 +196,7 @@ class MonthYearField(TextField): if value: self.value = value - + def get_data(self): if self._value: return self._value.strftime("%m%Y") @@ -215,9 +215,9 @@ class MonthYearField(TextField): self._value = datetime.date(*[int(x) for x in value[2:6], value[0:2], 1]) else: self._value = None - - def __getvalue(self): + + def __getvalue(self): return self._value - + value = property(__getvalue, __setvalue) diff --git a/pyaccuwage/parser.py b/pyaccuwage/parser.py index 1b475ca..b367db1 100644 --- a/pyaccuwage/parser.py +++ b/pyaccuwage/parser.py @@ -132,7 +132,7 @@ class RecordBuilder(object): (fields.BlankField, { 'regexp': { 'name': [ - re.compile(r'^blank$'), + (re.compile(r'^blank$', re.IGNORECASE), +1), ], }, }), @@ -140,9 +140,13 @@ class RecordBuilder(object): (fields.MoneyField, { 'regexp': { 'desc': [ - re.compile(r'right\-justified'), - re.compile(r'amount'), - re.compile(r'zero\-filled'), + (re.compile(r'right\-justif', re.IGNORECASE), +1), + (re.compile(r'amount', re.IGNORECASE), +1), + (re.compile(r'zero\-filled', re.IGNORECASE), +1), + (re.compile(r'leading zeroes', re.IGNORECASE), +1), + + (re.compile(r'left-\justif', re.IGNORECASE), -1), + ], }, }), @@ -150,7 +154,7 @@ class RecordBuilder(object): (fields.TextField, { 'regexp': { 'desc': [ - re.compile(r'enter blanks'), + (re.compile(r'blanks', re.IGNORECASE), +1), ], }, }), @@ -158,12 +162,24 @@ class RecordBuilder(object): (fields.StateField, { 'regexp': { 'desc': [ - re.compile(r'state'), - re.compile(r'postal'), + (re.compile(r'state', re.IGNORECASE), +1), + (re.compile(r'postal', re.IGNORECASE), +1), ], }, 'length': ['=2'], - }) + }), + + (fields.IntegerField, { + 'regexp': { + 'desc': [ + (re.compile(r'right\-justif', re.IGNORECASE), +1), + (re.compile(r'leading zeroes', re.IGNORECASE), +1), + (re.compile(r'number', re.IGNORECASE), +1), + + (re.compile(r'left\-justif', re.IGNORECASE), -1), + ], + }, + }), ] def load(self, entries): @@ -233,8 +249,8 @@ class RecordBuilder(object): if 'regexp' in criteria: for crit_key, crit_values in criteria['regexp'].items(): - for crit_re in crit_values: - matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 + for (crit_re, score) in crit_values: + matches[classtype] += score if crit_re.search(entry[crit_key]) else 0 matches = list(matches.items()) diff --git a/scripts/pyaccuwage-pdfparse b/scripts/pyaccuwage-pdfparse index 1b147c3..3cb781b 100755 --- a/scripts/pyaccuwage-pdfparse +++ b/scripts/pyaccuwage-pdfparse @@ -21,6 +21,8 @@ def generate_imports(): ]) def generate_class_begin(name): + name = re.sub(r"^[\d]*", "", name) + print name return "class %s(mode.Model):\n" % name if args.full: @@ -67,7 +69,9 @@ for rec in records: #print last_record_ends_at + 1, begins_at if last_record_ends_at + 1 != begins_at: - sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0].split(':')[-1])) + name = re.sub('^[^a-zA-Z]*','',rec[0].split(':')[-1]) + name = re.sub('[^\w]*', '', name) + sys.stdout.write("\nclass %s(object):\n" % name) for field in builder.load(map(lambda x:x.tuple, rec[1][0:])): sys.stdout.write('\t' + field + '\n') diff --git a/setup.py b/setup.py index 3e5bdf9..66da92f 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from distutils.core import setup setup(name='pyaccuwage', - version='0.2012.0', + version='0.2012.1', packages=['pyaccuwage'], scripts=['scripts/pyaccuwage-parse', 'scripts/pyaccuwage-pdfparse'], zip_safe=True,