diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index 12c99e6..ef9a023 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -22,7 +22,6 @@ class PDFRecordFinder(object): self.field_heading_exp = field_heading_exp def records(self): - #headings = self.locate_heading_rows() headings = self.locate_heading_rows_by_field() for (start, end, name) in headings: @@ -55,7 +54,7 @@ class PDFRecordFinder(object): position -= 1 name = ''.join(header).strip().decode('ascii','ignore') - results.append((i, name)) + results.append((i, name, position)) else: # See if this row forces us to break from field reading. if re.search('Record\ Layout', row): @@ -65,7 +64,7 @@ class PDFRecordFinder(object): for (a, b) in zip(results, results[1:] + [(len(self.textrows), None)]): end_pos = None - print a[0], record_break[0], b[0]-1 + #print a[0], record_break[0], b[0]-1 while record_break and record_break[0] < a[0]: record_break = record_break[1:] @@ -76,7 +75,7 @@ class PDFRecordFinder(object): else: end_pos = b[0]-1 - merged.append( (a[0], end_pos, a[1]) ) + merged.append( (a[0], end_pos-1, a[1]) ) return merged def locate_heading_rows(self): @@ -110,7 +109,6 @@ class PDFRecordFinder(object): for r in row_iter: row = self.extract_columns_from_row(r.decode('UTF-8')) - if not row: cc.empty_row() continue @@ -249,7 +247,6 @@ class ColumnCollector(object): def adjust_columns(self, data): adjusted_data = {} - for col_id, value in data: if col_id in self.data.keys(): adjusted_data[col_id] = value.strip() diff --git a/scripts/pyaccuwage-pdfparse b/scripts/pyaccuwage-pdfparse index 897ef53..1b147c3 100755 --- a/scripts/pyaccuwage-pdfparse +++ b/scripts/pyaccuwage-pdfparse @@ -65,11 +65,11 @@ for rec in records: # a lump of text and not necessarily a field entry. I assume # this is cleaned out by the record builder class. - print last_record_ends_at + 1, begins_at - #if last_record_ends_at + 1 != begins_at: - sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0])) + #print last_record_ends_at + 1, begins_at + if last_record_ends_at + 1 != begins_at: + sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0].split(':')[-1])) - for field in builder.load(map(lambda x:x.tuple, rec[1][1:])): + for field in builder.load(map(lambda x:x.tuple, rec[1][0:])): sys.stdout.write('\t' + field + '\n') #print field