diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index 3f6cdfd..9bb5e54 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -42,18 +42,15 @@ class PDFRecordFinder(object): def find_fields(self, row_iter): cc = ColumnCollector() + blank_row_counter = 0 for r in row_iter: row = self.extract_columns_from_row(r.decode('UTF-8')) if not row: + cc.empty_row() continue - #if cc.is_next_field(row): - # print len(cc.data) - # yield cc - # cc = ColumnCollector() - try: cc.add(row) except IsNextField, e: @@ -137,9 +134,13 @@ class ColumnCollector(object): self.column_widths = None self.max_data_length = 0 self.adjust_pad = 3 + self.empty_rows = 0 pass def add(self, data): + if self.empty_rows > 2: + raise IsNextField() + if not self.data: self.data = dict(data) else: @@ -151,6 +152,9 @@ class ColumnCollector(object): self.update_column_widths(data) + def empty_row(self): + self.empty_rows += 1 + def update_column_widths(self, data): self.last_data_length = len(data) self.max_data_length = max(self.max_data_length, len(data)) @@ -177,6 +181,7 @@ class ColumnCollector(object): def adjust_columns(self, data): adjusted_data = {} + for col_id, value in data: if col_id in self.data.keys(): adjusted_data[col_id] = value.strip() @@ -245,6 +250,9 @@ class ColumnCollector(object): @property def tuple(self): - return tuple(map(lambda k:self.data[k], sorted(self.data.keys()))) - + try: + return tuple(map(lambda k:self.data[k], sorted(self.data.keys()))) + except: + import pdb + pdb.set_trace() diff --git a/scripts/pyaccuwage-pdfparse b/scripts/pyaccuwage-pdfparse index 7cb05cb..897ef53 100755 --- a/scripts/pyaccuwage-pdfparse +++ b/scripts/pyaccuwage-pdfparse @@ -32,23 +32,46 @@ doc = PDFRecordFinder(source_file) records = doc.records() builder = RecordBuilder() -def record_begins_at(record): - return int(record[1][1].data.values()[0].split('-')[0], 10) +def record_begins_at(field): + return int(fields[0].data.values()[0].split('-')[0], 10) -def record_ends_at(record): - return record[1][-1].data - return int(record[1][-1].data.values()[0].split('-')[-1], 10) +def record_ends_at(fields): + return int(fields[-1].data.values()[0].split('-')[-1], 10) + +last_record_begins_at = -1 +last_record_ends_at = -1 for rec in records: + #if not rec[1]: + # continue # no actual fields detected + fields = rec[1] + + # strip out fields that are not 4 items long + fields = filter(lambda x:len(x.tuple) == 4, fields) + + # strip fields that don't begin at position 0 + fields = filter(lambda x: 0 in x.data, fields) + + # strip fields that don't have a length-range type item in position 0 + fields = filter(lambda x: re.match('^\d+[-]?\d*$', x.data[0]), fields) + + if not fields: + continue + + begins_at = record_begins_at(fields) + ends_at = record_ends_at(fields) - print record_begins_at(rec) #, 'to', record_ends_at(rec) # FIXME record_ends_at is randomly exploding due to record data being # a lump of text and not necessarily a field entry. I assume # this is cleaned out by the record builder class. - sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0])) + print last_record_ends_at + 1, begins_at + #if last_record_ends_at + 1 != begins_at: + sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0])) for field in builder.load(map(lambda x:x.tuple, rec[1][1:])): sys.stdout.write('\t' + field + '\n') #print field + last_record_ends_at = ends_at +