diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index 9b7c64a..8d149e0 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -11,30 +11,51 @@ import pdb class PDFRecordFinder(object): def __init__(self, src, heading_exp=None): if not heading_exp: - heading_exp = re.compile('\s+Record Name: (.*)') + heading_exp = re.compile('(\s+Record Name: (.*)|\s+(.*Record Layout.*)') + + field_heading_exp = re.compile('^Field.*Field.*Length.*Descrition') opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-'] pdftext = subprocess.check_output(opts) self.textrows = pdftext.split('\n') self.heading_exp = heading_exp - + self.field_heading_exp = field_heading_exp def records(self): - headings = self.locate_heading_rows() + #headings = self.locate_heading_rows() + headings = self.locate_heading_rows_by_field() + for (start, end, name) in headings: name = name.decode('ascii', 'ignore') yield (name, list(self.find_fields(iter(self.textrows[start+1:end])))) + def locate_heading_rows_by_field(self): + results = [] + for (i, row) in enumerate(self.textrows): + match = self.field_heading_exp.match(row) + if match: + # work backwards until we think the header is fully copied + space_count_exp = re.compile('^(\s*)') + position = i - 1 + last_spaces = space_count_exp.search(self.textrows[position] + complete = False + while not complete: + position -= 1 + spaces = space_count_exp.search(self.textrows[position]) + if spaces > last_spaces: + print 'HEADER', self.textrows[position:i] + complete = True + + def locate_heading_rows(self): results = [] for (i, row) in enumerate(self.textrows): match = self.heading_exp.match(row) if match: - print i,match.groups() - #if not ''.join(match.groups()).lower().endswith('(continued)'): + #print i,match.groups() results.append((i, ''.join(match.groups()))) - + """ results2 = [] for r in results: @@ -49,25 +70,36 @@ class PDFRecordFinder(object): return merged + def locate_layout_block_rows(self): + """ + Search for rows that contain "Record Layout", as these are not fields + we are interested in because they contain the crazy blocks of field definitions + and not the nice 4-column ones that we're looking for.""" + + results = [] + for (i, row) in enumerate(self.textrows): + match = re.match("Record Layout", row) + + def find_fields(self, row_iter): cc = ColumnCollector() for r in row_iter: row = self.extract_columns_from_row(r.decode('UTF-8')) - + if not row: continue - + if cc.is_next_field(row): - if row[1][1] == 'Vendor Indicator': - import pdb - pdb.set_trace() + #if row[1][1] == 'Vendor Indicator': + # import pdb + # pdb.set_trace() yield cc cc = ColumnCollector() - + try: cc.add(row) - + except UnknownColumn, e: raise StopIteration yield cc @@ -102,13 +134,14 @@ class PDFRecordFinder(object): row_result.append( (start, row[start:end].encode('ascii','ignore')) ) - + except StopIteration: white_iter = None + #print row_result return row_result - - + + class UnknownColumn(Exception): pass @@ -128,12 +161,12 @@ class ColumnCollector(object): raise IsNextField() for col_id, value in data: self.merge_column(col_id, value) - + def merge_column(self, col_id, value): if col_id in self.data.keys(): self.data[col_id] += ' ' + value.strip() - - else: + + else: # try adding a wiggle room value? # FIXME: # Sometimes description columns contain column-like @@ -143,7 +176,7 @@ class ColumnCollector(object): # max column? raise UnknownColumn - + def is_next_field(self, data): """ If the first key value contains a string