Merge branch 'master' of brimstone.klowner.com:pyaccuwage
Conflicts: pyaccuwage/pdfextract.py
This commit is contained in:
commit
456c15eb1c
6 changed files with 85 additions and 74 deletions
|
@ -65,7 +65,7 @@ class TextField(Field):
|
||||||
|
|
||||||
|
|
||||||
class StateField(TextField):
|
class StateField(TextField):
|
||||||
def __init__(self, name=None, required=True, use_numeric=False):
|
def __init__(self, name=None, required=True, use_numeric=False, max_length=2):
|
||||||
super(StateField, self).__init__(name=name, max_length=2, required=required)
|
super(StateField, self).__init__(name=name, max_length=2, required=required)
|
||||||
self.use_numeric = use_numeric
|
self.use_numeric = use_numeric
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,7 @@ import pdb
|
||||||
class Model(object):
|
class Model(object):
|
||||||
record_identifier = ' '
|
record_identifier = ' '
|
||||||
required = False
|
required = False
|
||||||
|
target_size = 512
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
for (key, value) in self.__class__.__dict__.items():
|
for (key, value) in self.__class__.__dict__.items():
|
||||||
|
@ -52,10 +53,16 @@ class Model(object):
|
||||||
custom_validator(f)
|
custom_validator(f)
|
||||||
|
|
||||||
def output(self):
|
def output(self):
|
||||||
result = ''.join([self.record_identifier] + [field.get_data() for field in self.get_sorted_fields()])
|
result = ''.join([self.record_identifier] +
|
||||||
if len(result) != 512:
|
[field.get_data() for field in self.get_sorted_fields()])
|
||||||
raise ValidationError("Record result length not equal to 512 bytes (%d)" % len(result))
|
if len(result) != self.record_length:
|
||||||
return result
|
raise ValidationError("Record result length not equal to %d bytes (%d)" % (self.record_length, len(result)))
|
||||||
|
|
||||||
|
#result = ''.join([self.record_identifier] + [field.get_data() for field in self.get_sorted_fields()])
|
||||||
|
#if len(result) != self.target_size:
|
||||||
|
# raise ValidationError("Record result length not equal to %d bytes (%d)" % (self.target_size, len(result)))
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
def read(self, fp):
|
def read(self, fp):
|
||||||
for field in self.get_sorted_fields():
|
for field in self.get_sorted_fields():
|
||||||
|
|
|
@ -132,7 +132,7 @@ class RecordBuilder(object):
|
||||||
(fields.BlankField, {
|
(fields.BlankField, {
|
||||||
'regexp': {
|
'regexp': {
|
||||||
'name': [
|
'name': [
|
||||||
re.compile(r'^blank$'),
|
(re.compile(r'^blank$', re.IGNORECASE), +1),
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
|
@ -140,9 +140,13 @@ class RecordBuilder(object):
|
||||||
(fields.MoneyField, {
|
(fields.MoneyField, {
|
||||||
'regexp': {
|
'regexp': {
|
||||||
'desc': [
|
'desc': [
|
||||||
re.compile(r'right\-justified'),
|
(re.compile(r'right\-justif', re.IGNORECASE), +1),
|
||||||
re.compile(r'amount'),
|
(re.compile(r'amount', re.IGNORECASE), +1),
|
||||||
re.compile(r'zero\-filled'),
|
(re.compile(r'zero\-filled', re.IGNORECASE), +1),
|
||||||
|
(re.compile(r'leading zeroes', re.IGNORECASE), +1),
|
||||||
|
|
||||||
|
(re.compile(r'left-\justif', re.IGNORECASE), -1),
|
||||||
|
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
|
@ -150,7 +154,7 @@ class RecordBuilder(object):
|
||||||
(fields.TextField, {
|
(fields.TextField, {
|
||||||
'regexp': {
|
'regexp': {
|
||||||
'desc': [
|
'desc': [
|
||||||
re.compile(r'enter blanks'),
|
(re.compile(r'blanks', re.IGNORECASE), +1),
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
|
@ -158,12 +162,24 @@ class RecordBuilder(object):
|
||||||
(fields.StateField, {
|
(fields.StateField, {
|
||||||
'regexp': {
|
'regexp': {
|
||||||
'desc': [
|
'desc': [
|
||||||
re.compile(r'state'),
|
(re.compile(r'state', re.IGNORECASE), +1),
|
||||||
re.compile(r'postal'),
|
(re.compile(r'postal', re.IGNORECASE), +1),
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
'length': ['=2'],
|
'length': ['=2'],
|
||||||
})
|
}),
|
||||||
|
|
||||||
|
(fields.IntegerField, {
|
||||||
|
'regexp': {
|
||||||
|
'desc': [
|
||||||
|
(re.compile(r'right\-justif', re.IGNORECASE), +1),
|
||||||
|
(re.compile(r'leading zeroes', re.IGNORECASE), +1),
|
||||||
|
(re.compile(r'number', re.IGNORECASE), +1),
|
||||||
|
|
||||||
|
(re.compile(r'left\-justif', re.IGNORECASE), -1),
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}),
|
||||||
]
|
]
|
||||||
|
|
||||||
def load(self, entries):
|
def load(self, entries):
|
||||||
|
@ -233,8 +249,8 @@ class RecordBuilder(object):
|
||||||
|
|
||||||
if 'regexp' in criteria:
|
if 'regexp' in criteria:
|
||||||
for crit_key, crit_values in criteria['regexp'].items():
|
for crit_key, crit_values in criteria['regexp'].items():
|
||||||
for crit_re in crit_values:
|
for (crit_re, score) in crit_values:
|
||||||
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
matches[classtype] += score if crit_re.search(entry[crit_key]) else 0
|
||||||
|
|
||||||
|
|
||||||
matches = list(matches.items())
|
matches = list(matches.items())
|
||||||
|
|
|
@ -24,6 +24,9 @@ class PDFRecordFinder(object):
|
||||||
def records(self):
|
def records(self):
|
||||||
headings = self.locate_heading_rows_by_field()
|
headings = self.locate_heading_rows_by_field()
|
||||||
|
|
||||||
|
#for x in headings:
|
||||||
|
# print x
|
||||||
|
|
||||||
for (start, end, name) in headings:
|
for (start, end, name) in headings:
|
||||||
name = name.decode('ascii', 'ignore')
|
name = name.decode('ascii', 'ignore')
|
||||||
yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))), (start+1, end))
|
yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))), (start+1, end))
|
||||||
|
@ -108,45 +111,30 @@ class PDFRecordFinder(object):
|
||||||
blank_row_counter = 0
|
blank_row_counter = 0
|
||||||
|
|
||||||
for r in row_iter:
|
for r in row_iter:
|
||||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
row = r.decode('UTF-8')
|
||||||
if not row:
|
#print row
|
||||||
cc.empty_row()
|
row_columns = self.extract_columns_from_row(row)
|
||||||
|
|
||||||
|
if not row_columns:
|
||||||
|
if cc.data and len(cc.data.keys()) > 1 and len(row.strip()) > cc.data.keys()[-1]:
|
||||||
|
yield cc
|
||||||
|
cc = ColumnCollector()
|
||||||
|
else:
|
||||||
|
cc.empty_row()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
cc.add(row)
|
cc.add(row_columns)
|
||||||
|
|
||||||
except IsNextField, e:
|
except IsNextField, e:
|
||||||
yield cc
|
yield cc
|
||||||
cc = ColumnCollector()
|
cc = ColumnCollector()
|
||||||
cc.add(row)
|
cc.add(row_columns)
|
||||||
except UnknownColumn, e:
|
except UnknownColumn, e:
|
||||||
raise StopIteration
|
raise StopIteration
|
||||||
|
|
||||||
yield cc
|
yield cc
|
||||||
|
|
||||||
def find_fields_old(self, row_iter):
|
|
||||||
cc = ColumnCollector()
|
|
||||||
|
|
||||||
for r in row_iter:
|
|
||||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
|
||||||
|
|
||||||
if not row:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if cc.is_next_field(row):
|
|
||||||
#if row[1][1] == 'Vendor Indicator':
|
|
||||||
# import pdb
|
|
||||||
# pdb.set_trace()
|
|
||||||
yield cc
|
|
||||||
cc = ColumnCollector()
|
|
||||||
|
|
||||||
try:
|
|
||||||
cc.add(row)
|
|
||||||
|
|
||||||
except UnknownColumn, e:
|
|
||||||
raise StopIteration
|
|
||||||
yield cc
|
|
||||||
|
|
||||||
|
|
||||||
def extract_columns_from_row(self, row):
|
def extract_columns_from_row(self, row):
|
||||||
re_multiwhite = re.compile(r'\s{2,}')
|
re_multiwhite = re.compile(r'\s{2,}')
|
||||||
|
@ -202,9 +190,9 @@ class ColumnCollector(object):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "<%s: %s>" % (
|
return "<%s: %s>" % (
|
||||||
self.__class__.__name__,
|
self.__class__.__name__,
|
||||||
map(lambda x:x if len(x) < 25 else x[:25] + '..',
|
map(lambda x:x if len(x) < 25 else x[:25] + '..',
|
||||||
self.data.values() if self.data else '' ))
|
self.data.values() if self.data else ''))
|
||||||
|
|
||||||
def add(self, data):
|
def add(self, data):
|
||||||
#if self.empty_rows > 2:
|
#if self.empty_rows > 2:
|
||||||
|
@ -249,7 +237,6 @@ class ColumnCollector(object):
|
||||||
|
|
||||||
def adjust_columns(self, data):
|
def adjust_columns(self, data):
|
||||||
adjusted_data = {}
|
adjusted_data = {}
|
||||||
|
|
||||||
for col_id, value in data:
|
for col_id, value in data:
|
||||||
if col_id in self.data.keys():
|
if col_id in self.data.keys():
|
||||||
adjusted_data[col_id] = value.strip()
|
adjusted_data[col_id] = value.strip()
|
||||||
|
@ -318,9 +305,11 @@ class ColumnCollector(object):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tuple(self):
|
def tuple(self):
|
||||||
try:
|
#try:
|
||||||
|
if self.data:
|
||||||
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
||||||
except:
|
return ()
|
||||||
import pdb
|
#except:
|
||||||
pdb.set_trace()
|
# import pdb
|
||||||
|
# pdb.set_trace()
|
||||||
|
|
||||||
|
|
|
@ -14,15 +14,12 @@ args = parser.parse_args()
|
||||||
|
|
||||||
def generate_imports():
|
def generate_imports():
|
||||||
return "\n".join([
|
return "\n".join([
|
||||||
"from pyaccuwage import model",
|
"from pyaccuwage import model as pyaccuwagemodel",
|
||||||
"from pyaccuwage.fields import *",
|
"from pyaccuwage.fields import *",
|
||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
])
|
])
|
||||||
|
|
||||||
def generate_class_begin(name):
|
|
||||||
return "class %s(mode.Model):\n" % name
|
|
||||||
|
|
||||||
if args.full:
|
if args.full:
|
||||||
sys.stdout.write(generate_imports())
|
sys.stdout.write(generate_imports())
|
||||||
|
|
||||||
|
@ -67,7 +64,9 @@ for rec in records:
|
||||||
|
|
||||||
#print last_record_ends_at + 1, begins_at
|
#print last_record_ends_at + 1, begins_at
|
||||||
if last_record_ends_at + 1 != begins_at:
|
if last_record_ends_at + 1 != begins_at:
|
||||||
sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0].split(':')[-1]))
|
name = re.sub('^[^a-zA-Z]*','',rec[0].split(':')[-1])
|
||||||
|
name = re.sub('[^\w]*', '', name)
|
||||||
|
sys.stdout.write("\nclass %s(pyaccuwagemodel.Model):\n" % name)
|
||||||
|
|
||||||
for field in builder.load(map(lambda x:x.tuple, rec[1][0:])):
|
for field in builder.load(map(lambda x:x.tuple, rec[1][0:])):
|
||||||
sys.stdout.write('\t' + field + '\n')
|
sys.stdout.write('\t' + field + '\n')
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -1,6 +1,6 @@
|
||||||
from distutils.core import setup
|
from distutils.core import setup
|
||||||
setup(name='pyaccuwage',
|
setup(name='pyaccuwage',
|
||||||
version='0.2012.0',
|
version='0.2012.1',
|
||||||
packages=['pyaccuwage'],
|
packages=['pyaccuwage'],
|
||||||
scripts=['scripts/pyaccuwage-parse', 'scripts/pyaccuwage-pdfparse'],
|
scripts=['scripts/pyaccuwage-parse', 'scripts/pyaccuwage-pdfparse'],
|
||||||
zip_safe=True,
|
zip_safe=True,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue