From 730073dcd12c9b3fdc2b3f030432fc6bc672bc71 Mon Sep 17 00:00:00 2001 From: Binh Nguyen Date: Tue, 5 Feb 2013 15:43:04 -0600 Subject: [PATCH 1/5] working better! --- pyaccuwage/pdfextract.py | 51 ++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index ef9a023..42cc9dd 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -108,45 +108,29 @@ class PDFRecordFinder(object): blank_row_counter = 0 for r in row_iter: - row = self.extract_columns_from_row(r.decode('UTF-8')) - if not row: - cc.empty_row() + row = r.decode('UTF-8') + row_columns = self.extract_columns_from_row(row) + + if not row_columns: + if cc.data and len(cc.data.keys()) > 1 and len(row.strip()) > cc.data.keys()[-1]: + yield cc + cc = ColumnCollector() + else: + cc.empty_row() continue try: - cc.add(row) + cc.add(row_columns) + except IsNextField, e: yield cc cc = ColumnCollector() - cc.add(row) + cc.add(row_columns) except UnknownColumn, e: raise StopIteration yield cc - def find_fields_old(self, row_iter): - cc = ColumnCollector() - - for r in row_iter: - row = self.extract_columns_from_row(r.decode('UTF-8')) - - if not row: - continue - - if cc.is_next_field(row): - #if row[1][1] == 'Vendor Indicator': - # import pdb - # pdb.set_trace() - yield cc - cc = ColumnCollector() - - try: - cc.add(row) - - except UnknownColumn, e: - raise StopIteration - yield cc - def extract_columns_from_row(self, row): re_multiwhite = re.compile(r'\s{2,}') @@ -246,7 +230,6 @@ class ColumnCollector(object): def adjust_columns(self, data): adjusted_data = {} - for col_id, value in data: if col_id in self.data.keys(): adjusted_data[col_id] = value.strip() @@ -315,9 +298,11 @@ class ColumnCollector(object): @property def tuple(self): - try: + #try: + if self.data: return tuple(map(lambda k:self.data[k], sorted(self.data.keys()))) - except: - import pdb - pdb.set_trace() + return () + #except: + # import pdb + # pdb.set_trace() From b40e736ae08bcb96d1a934a98cde1a0a3033b55a Mon Sep 17 00:00:00 2001 From: Binh Nguyen Date: Tue, 19 Feb 2013 15:55:05 -0600 Subject: [PATCH 2/5] bumping version, improving field type guessing --- pyaccuwage/fields.py | 34 +++++++++++++++++----------------- pyaccuwage/parser.py | 36 ++++++++++++++++++++++++++---------- scripts/pyaccuwage-pdfparse | 6 +++++- setup.py | 2 +- 4 files changed, 49 insertions(+), 29 deletions(-) diff --git a/pyaccuwage/fields.py b/pyaccuwage/fields.py index 9e38448..960db3e 100644 --- a/pyaccuwage/fields.py +++ b/pyaccuwage/fields.py @@ -12,7 +12,7 @@ class ValidationError(Exception): return "(%s.%s) %s" % (self.field.parent_name, self.field.name, self.msg) else: return repr(self.msg) - + class Field(object): creation_counter = 0 @@ -31,13 +31,13 @@ class Field(object): def get_data(self): raise NotImplemented - + def __setvalue(self, value): self._value = value - def __getvalue(self): + def __getvalue(self): return self._value - + value = property(__getvalue, __setvalue) def read(self, fp): @@ -50,7 +50,7 @@ class Field(object): self.value = s.strip() -class TextField(Field): +class TextField(Field): def validate(self): if self.value == None and self.required: raise ValidationError("value required", field=self) @@ -65,7 +65,7 @@ class TextField(Field): class StateField(TextField): - def __init__(self, name=None, required=True, use_numeric=False): + def __init__(self, name=None, required=True, use_numeric=False, max_length=2): super(StateField, self).__init__(name=name, max_length=2, required=required) self.use_numeric = use_numeric @@ -90,7 +90,7 @@ class StateField(TextField): class EmailField(TextField): def __init__(self, name=None, required=True, max_length=None): - return super(EmailField, self).__init__(name=name, max_length=max_length, + return super(EmailField, self).__init__(name=name, max_length=max_length, required=required, uppercase=False) class IntegerField(TextField): @@ -101,7 +101,7 @@ class IntegerField(TextField): int(self.value) except ValueError: raise ValidationError("field contains non-numeric characters", field=self) - + def get_data(self): value = self.value or "" @@ -123,7 +123,7 @@ class StaticField(TextField): class BlankField(TextField): def __init__(self, name=None, max_length=0, required=False): super(TextField, self).__init__(name=name, max_length=max_length, required=required, uppercase=False) - + def get_data(self): return " " * self.max_length @@ -161,7 +161,7 @@ class MoneyField(Field): class DateField(TextField): def __init__(self, name=None, required=True, value=None): - super(TextField, self).__init__(name=name, required=required, max_length=8) + super(TextField, self).__init__(name=name, required=required, max_length=8) if value: self.value = value @@ -169,7 +169,7 @@ class DateField(TextField): if self._value: return self._value.strftime('%m%d%Y') return '0' * self.max_length - + def parse(self, s): if int(s) > 0: self.value = datetime.date(*[int(x) for x in s[4:8], s[0:2], s[2:4]]) @@ -184,9 +184,9 @@ class DateField(TextField): else: self._value = None - def __getvalue(self): + def __getvalue(self): return self._value - + value = property(__getvalue, __setvalue) @@ -196,7 +196,7 @@ class MonthYearField(TextField): if value: self.value = value - + def get_data(self): if self._value: return self._value.strftime("%m%Y") @@ -215,9 +215,9 @@ class MonthYearField(TextField): self._value = datetime.date(*[int(x) for x in value[2:6], value[0:2], 1]) else: self._value = None - - def __getvalue(self): + + def __getvalue(self): return self._value - + value = property(__getvalue, __setvalue) diff --git a/pyaccuwage/parser.py b/pyaccuwage/parser.py index 1b475ca..b367db1 100644 --- a/pyaccuwage/parser.py +++ b/pyaccuwage/parser.py @@ -132,7 +132,7 @@ class RecordBuilder(object): (fields.BlankField, { 'regexp': { 'name': [ - re.compile(r'^blank$'), + (re.compile(r'^blank$', re.IGNORECASE), +1), ], }, }), @@ -140,9 +140,13 @@ class RecordBuilder(object): (fields.MoneyField, { 'regexp': { 'desc': [ - re.compile(r'right\-justified'), - re.compile(r'amount'), - re.compile(r'zero\-filled'), + (re.compile(r'right\-justif', re.IGNORECASE), +1), + (re.compile(r'amount', re.IGNORECASE), +1), + (re.compile(r'zero\-filled', re.IGNORECASE), +1), + (re.compile(r'leading zeroes', re.IGNORECASE), +1), + + (re.compile(r'left-\justif', re.IGNORECASE), -1), + ], }, }), @@ -150,7 +154,7 @@ class RecordBuilder(object): (fields.TextField, { 'regexp': { 'desc': [ - re.compile(r'enter blanks'), + (re.compile(r'blanks', re.IGNORECASE), +1), ], }, }), @@ -158,12 +162,24 @@ class RecordBuilder(object): (fields.StateField, { 'regexp': { 'desc': [ - re.compile(r'state'), - re.compile(r'postal'), + (re.compile(r'state', re.IGNORECASE), +1), + (re.compile(r'postal', re.IGNORECASE), +1), ], }, 'length': ['=2'], - }) + }), + + (fields.IntegerField, { + 'regexp': { + 'desc': [ + (re.compile(r'right\-justif', re.IGNORECASE), +1), + (re.compile(r'leading zeroes', re.IGNORECASE), +1), + (re.compile(r'number', re.IGNORECASE), +1), + + (re.compile(r'left\-justif', re.IGNORECASE), -1), + ], + }, + }), ] def load(self, entries): @@ -233,8 +249,8 @@ class RecordBuilder(object): if 'regexp' in criteria: for crit_key, crit_values in criteria['regexp'].items(): - for crit_re in crit_values: - matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 + for (crit_re, score) in crit_values: + matches[classtype] += score if crit_re.search(entry[crit_key]) else 0 matches = list(matches.items()) diff --git a/scripts/pyaccuwage-pdfparse b/scripts/pyaccuwage-pdfparse index 1b147c3..3cb781b 100755 --- a/scripts/pyaccuwage-pdfparse +++ b/scripts/pyaccuwage-pdfparse @@ -21,6 +21,8 @@ def generate_imports(): ]) def generate_class_begin(name): + name = re.sub(r"^[\d]*", "", name) + print name return "class %s(mode.Model):\n" % name if args.full: @@ -67,7 +69,9 @@ for rec in records: #print last_record_ends_at + 1, begins_at if last_record_ends_at + 1 != begins_at: - sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0].split(':')[-1])) + name = re.sub('^[^a-zA-Z]*','',rec[0].split(':')[-1]) + name = re.sub('[^\w]*', '', name) + sys.stdout.write("\nclass %s(object):\n" % name) for field in builder.load(map(lambda x:x.tuple, rec[1][0:])): sys.stdout.write('\t' + field + '\n') diff --git a/setup.py b/setup.py index 3e5bdf9..66da92f 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from distutils.core import setup setup(name='pyaccuwage', - version='0.2012.0', + version='0.2012.1', packages=['pyaccuwage'], scripts=['scripts/pyaccuwage-parse', 'scripts/pyaccuwage-pdfparse'], zip_safe=True, From afc4138898faf0e901ca08823465679e176fbbe4 Mon Sep 17 00:00:00 2001 From: Binh Nguyen Date: Tue, 19 Feb 2013 16:06:11 -0600 Subject: [PATCH 3/5] fixed automatic model generation inheretence --- scripts/pyaccuwage-pdfparse | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/scripts/pyaccuwage-pdfparse b/scripts/pyaccuwage-pdfparse index 3cb781b..6a35387 100755 --- a/scripts/pyaccuwage-pdfparse +++ b/scripts/pyaccuwage-pdfparse @@ -14,17 +14,12 @@ args = parser.parse_args() def generate_imports(): return "\n".join([ - "from pyaccuwage import model", + "from pyaccuwage import model as pyaccuwagemodel", "from pyaccuwage.fields import *", "", "", ]) -def generate_class_begin(name): - name = re.sub(r"^[\d]*", "", name) - print name - return "class %s(mode.Model):\n" % name - if args.full: sys.stdout.write(generate_imports()) @@ -71,7 +66,7 @@ for rec in records: if last_record_ends_at + 1 != begins_at: name = re.sub('^[^a-zA-Z]*','',rec[0].split(':')[-1]) name = re.sub('[^\w]*', '', name) - sys.stdout.write("\nclass %s(object):\n" % name) + sys.stdout.write("\nclass %s(pyaccuwagemodel.Model):\n" % name) for field in builder.load(map(lambda x:x.tuple, rec[1][0:])): sys.stdout.write('\t' + field + '\n') From a1ab6b49186114da31e128a14d1c36e6fc7bed41 Mon Sep 17 00:00:00 2001 From: Binh Nguyen Date: Tue, 5 Mar 2013 14:49:38 -0600 Subject: [PATCH 4/5] Looks like 1220 form has changed since last year, work on getting changes applied in a simple manner. --- pyaccuwage/model.py | 7 ++++--- pyaccuwage/pdfextract.py | 9 ++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pyaccuwage/model.py b/pyaccuwage/model.py index 9e206db..1070653 100644 --- a/pyaccuwage/model.py +++ b/pyaccuwage/model.py @@ -6,6 +6,7 @@ import pdb class Model(object): record_identifier = ' ' required = False + target_size = 512 def __init__(self): for (key, value) in self.__class__.__dict__.items(): @@ -32,7 +33,7 @@ class Model(object): for key in self.__class__.__dict__.keys(): attr = getattr(self, key) if isinstance(attr, Field): - fields.append(attr) + fields.append(attr) return fields def get_sorted_fields(self): @@ -53,8 +54,8 @@ class Model(object): def output(self): result = ''.join([self.record_identifier] + [field.get_data() for field in self.get_sorted_fields()]) - if len(result) != 512: - raise ValidationError("Record result length not equal to 512 bytes (%d)" % len(result)) + if len(result) != self.target_size: + raise ValidationError("Record result length not equal to %d bytes (%d)" % (self.target_size, len(result))) return result def read(self, fp): diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index 42cc9dd..7660912 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -24,6 +24,9 @@ class PDFRecordFinder(object): def records(self): headings = self.locate_heading_rows_by_field() + #for x in headings: + # print x + for (start, end, name) in headings: name = name.decode('ascii', 'ignore') yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))), (start+1, end)) @@ -109,6 +112,7 @@ class PDFRecordFinder(object): for r in row_iter: row = r.decode('UTF-8') + #print row row_columns = self.extract_columns_from_row(row) if not row_columns: @@ -185,7 +189,10 @@ class ColumnCollector(object): pass def __repr__(self): - return "<%s: %s>" % (self.__class__.__name__, map(lambda x:x if len(x) < 25 else x[:25] + '..', self.data.values())) + return "<%s: %s>" % ( + self.__class__.__name__, + map(lambda x:x if len(x) < 25 else x[:25] + '..', + self.data.values() if self.data else '')) def add(self, data): #if self.empty_rows > 2: From d058e64d26dee701125d865c8499dab657adf4a7 Mon Sep 17 00:00:00 2001 From: Binh Nguyen Date: Wed, 20 Mar 2013 15:13:44 -0500 Subject: [PATCH 5/5] tweaking validation --- pyaccuwage/model.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyaccuwage/model.py b/pyaccuwage/model.py index 9e206db..044ba27 100644 --- a/pyaccuwage/model.py +++ b/pyaccuwage/model.py @@ -6,6 +6,7 @@ import pdb class Model(object): record_identifier = ' ' required = False + record_length = 512 def __init__(self): for (key, value) in self.__class__.__dict__.items(): @@ -52,9 +53,10 @@ class Model(object): custom_validator(f) def output(self): - result = ''.join([self.record_identifier] + [field.get_data() for field in self.get_sorted_fields()]) - if len(result) != 512: - raise ValidationError("Record result length not equal to 512 bytes (%d)" % len(result)) + result = ''.join([self.record_identifier] + + [field.get_data() for field in self.get_sorted_fields()]) + if len(result) != self.record_length: + raise ValidationError("Record result length not equal to %d bytes (%d)" % (self.record_length, len(result))) return result def read(self, fp):