working better!
This commit is contained in:
parent
e6e087ef38
commit
730073dcd1
1 changed file with 18 additions and 33 deletions
|
@@ -108,45 +108,29 @@ class PDFRecordFinder(object):
|
||||||
blank_row_counter = 0
|
blank_row_counter = 0
|
||||||
|
|
||||||
for r in row_iter:
|
for r in row_iter:
|
||||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
row = r.decode('UTF-8')
|
||||||
if not row:
|
row_columns = self.extract_columns_from_row(row)
|
||||||
cc.empty_row()
|
|
||||||
|
if not row_columns:
|
||||||
|
if cc.data and len(cc.data.keys()) > 1 and len(row.strip()) > cc.data.keys()[-1]:
|
||||||
|
yield cc
|
||||||
|
cc = ColumnCollector()
|
||||||
|
else:
|
||||||
|
cc.empty_row()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
cc.add(row)
|
cc.add(row_columns)
|
||||||
|
|
||||||
except IsNextField, e:
|
except IsNextField, e:
|
||||||
yield cc
|
yield cc
|
||||||
cc = ColumnCollector()
|
cc = ColumnCollector()
|
||||||
cc.add(row)
|
cc.add(row_columns)
|
||||||
except UnknownColumn, e:
|
except UnknownColumn, e:
|
||||||
raise StopIteration
|
raise StopIteration
|
||||||
|
|
||||||
yield cc
|
yield cc
|
||||||
|
|
||||||
def find_fields_old(self, row_iter):
|
|
||||||
cc = ColumnCollector()
|
|
||||||
|
|
||||||
for r in row_iter:
|
|
||||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
|
||||||
|
|
||||||
if not row:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if cc.is_next_field(row):
|
|
||||||
#if row[1][1] == 'Vendor Indicator':
|
|
||||||
# import pdb
|
|
||||||
# pdb.set_trace()
|
|
||||||
yield cc
|
|
||||||
cc = ColumnCollector()
|
|
||||||
|
|
||||||
try:
|
|
||||||
cc.add(row)
|
|
||||||
|
|
||||||
except UnknownColumn, e:
|
|
||||||
raise StopIteration
|
|
||||||
yield cc
|
|
||||||
|
|
||||||
|
|
||||||
def extract_columns_from_row(self, row):
|
def extract_columns_from_row(self, row):
|
||||||
re_multiwhite = re.compile(r'\s{2,}')
|
re_multiwhite = re.compile(r'\s{2,}')
|
||||||
|
@@ -246,7 +230,6 @@ class ColumnCollector(object):
|
||||||
|
|
||||||
def adjust_columns(self, data):
|
def adjust_columns(self, data):
|
||||||
adjusted_data = {}
|
adjusted_data = {}
|
||||||
|
|
||||||
for col_id, value in data:
|
for col_id, value in data:
|
||||||
if col_id in self.data.keys():
|
if col_id in self.data.keys():
|
||||||
adjusted_data[col_id] = value.strip()
|
adjusted_data[col_id] = value.strip()
|
||||||
|
@@ -315,9 +298,11 @@ class ColumnCollector(object):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tuple(self):
|
def tuple(self):
|
||||||
try:
|
#try:
|
||||||
|
if self.data:
|
||||||
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
||||||
except:
|
return ()
|
||||||
import pdb
|
#except:
|
||||||
pdb.set_trace()
|
# import pdb
|
||||||
|
# pdb.set_trace()
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue