diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index 365d322..eb2a9f0 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -7,6 +7,7 @@ import pdb """ pdftotext -layout -nopgbrk p1220.pdf - """ + class PDFRecordFinder(object): def __init__(self, src, heading_exp=None): if not heading_exp: @@ -16,21 +17,49 @@ class PDFRecordFinder(object): pdftext = subprocess.check_output(opts) self.textrows = pdftext.split('\n') self.heading_exp = heading_exp - - @property - def rows(self): - for row in self.textrows: - yield row - + + """ @property def records(self): - row_iter = self.rows - for r in row_iter: - if self.heading_exp.match(r): - record = self.extract_record(row_iter) - yield record + row_iter = iter(self.textrows) + try: + while 1: + row = row_iter.next() + if self.heading_exp.match(row): + record = self.extract_record(row_iter) + yield record + except Exception, e: + raise e + #for r in self.textrows: + # if self.heading_exp.match(r): + # record = self.extract_record() + # yield record + """ - def extract_record(self, row_iter): + + def columns(self): + results = [] + cc = ColumnCollector() + for group in self.record_grouping(): + for row in group: + if cc.is_next_record(row): + yield cc + cc = ColumnCollector() + print row + cc.add(row) + + + + def record_grouping(self): + row_iter = iter(self.textrows) + i = 0 + for row in row_iter: + i += 1 + if self.heading_exp.match(row): + yield self.extract_record_columns(row_iter) + + + def extract_record_columns(self, row_iter): re_multiwhite = re.compile(r'\s{2,}') result = [] full_width_text_count = 0 @@ -40,6 +69,9 @@ class PDFRecordFinder(object): if not row: continue + #if row.strip().startswith('Code'): + # pdb.set_trace() + # IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE if not re_multiwhite.search(row): full_width_text_count += 1 @@ -48,7 +80,39 @@ class PDFRecordFinder(object): # return result continue - #pdb.set_trace() + white_ranges = [0,] + pos = 0 + match = 
class UnknownColumn(Exception):
    """Raised when a row carries a column id the current record does not have."""
    pass


class IsNextRecord(Exception):
    """Raised when a row added to a collector belongs to the next record."""
    pass


class ColumnCollector(object):
    """Accumulate the columns of one table record spread over several rows.

    A row is a sequence of ``(col_id, text)`` pairs, where ``col_id`` is the
    character offset at which the column text starts in the extracted line.
    The first row added defines the record's columns; later rows are treated
    as continuation lines and their text is appended to the matching column.
    """

    def __init__(self, initial=None):
        """Create an empty collector, optionally seeded with a first row.

        initial -- optional sequence of ``(col_id, text)`` pairs; when given
                   it is added exactly as if ``add(initial)`` had been called.
        """
        self.data = None
        if initial is not None:
            self.add(initial)

    def add(self, data):
        """Add one row of ``(col_id, text)`` pairs to the record.

        The first non-empty row initialises the column mapping as-is.
        Subsequent rows are merged column by column.

        Raises IsNextRecord if the row starts in the record's leftmost
        column (see ``is_next_record``), and UnknownColumn if the row has a
        column id that the record does not contain.
        """
        # Materialise once so that one-shot iterables can be inspected by
        # is_next_record() and then merged.
        pairs = list(data)
        if not self.data:
            self.data = dict(pairs)
            return
        if self.is_next_record(pairs):
            raise IsNextRecord()
        for col_id, value in pairs:
            self.merge_column(col_id, value)

    def merge_column(self, col_id, value):
        """Append ``value`` (stripped, space separated) to column ``col_id``.

        Raises UnknownColumn when ``col_id`` is not a column of this record.
        """
        if col_id not in self.data:
            # try adding a wiggle room value?
            raise UnknownColumn
        self.data[col_id] += ' ' + value.strip()

    def is_next_record(self, data):
        """Return True if ``data`` looks like the first row of a new record.

        A row is taken to start a new record when this collector already
        holds data and the row's leftmost column id equals the leftmost
        column id of the collected record. Returns False for an empty row or
        an empty collector. Nothing is raised here; ``add`` is the method
        that turns a positive answer into IsNextRecord.
        """
        pairs = list(data)
        if not self.data or not pairs:
            return False
        # Dict ordering must not be relied on to find the "first" column, so
        # compare the smallest column offsets explicitly.
        leftmost_new = min(col_id for col_id, _ in pairs)
        return min(self.data) == leftmost_new