diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index d10e6cb..12c99e6 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -11,9 +11,9 @@ import pdb class PDFRecordFinder(object): def __init__(self, src, heading_exp=None): if not heading_exp: - heading_exp = re.compile('(\s+Record Name: (.*)|\s+(.*Record Layout.*)') + heading_exp = re.compile('\s+Record Name: (.*)') - field_heading_exp = re.compile('^Field.*Field.*Length.*Descrition') + field_heading_exp = re.compile('^Field.*Field.*Length.*Description') opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-'] pdftext = subprocess.check_output(opts) @@ -27,26 +27,57 @@ class PDFRecordFinder(object): for (start, end, name) in headings: name = name.decode('ascii', 'ignore') - yield (name, list(self.find_fields(iter(self.textrows[start+1:end])))) + yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))), (start+1, end)) def locate_heading_rows_by_field(self): results = [] + record_break = [] for (i, row) in enumerate(self.textrows): match = self.field_heading_exp.match(row) if match: # work backwards until we think the header is fully copied space_count_exp = re.compile('^(\s*)') position = i - 1 - last_spaces = space_count_exp.search(self.textrows[position] + last_spaces = 10000 complete = False + header = None while not complete: - position -= 1 - spaces = space_count_exp.search(self.textrows[position]) - if spaces > last_spaces: - print 'HEADER', self.textrows[position:i] - complete = True + if len(self.textrows[position].strip()) == 0: + spaces = 10000 + else: + spaces = space_count_exp.search(self.textrows[position]).end() + if spaces > last_spaces: + header = self.textrows[position + 1:i] + complete = True + last_spaces = spaces + position -= 1 + + name = ''.join(header).strip().decode('ascii','ignore') + results.append((i, name)) + else: + # See if this row forces us to break from field reading. + if re.search('Record\ Layout', row): + record_break.append(i) + + merged = [] + for (a, b) in zip(results, results[1:] + [(len(self.textrows), None)]): + end_pos = None + + print a[0], record_break[0], b[0]-1 + + while record_break and record_break[0] < a[0]: + record_break = record_break[1:] + + if record_break[0] < b[0]-1: + end_pos = record_break[0] + record_break = record_break[1:] + else: + end_pos = b[0]-1 + + merged.append( (a[0], end_pos, a[1]) ) + return merged def locate_heading_rows(self): results = [] @@ -175,8 +206,8 @@ class ColumnCollector(object): return "<%s: %s>" % (self.__class__.__name__, map(lambda x:x if len(x) < 25 else x[:25] + '..', self.data.values())) def add(self, data): - if self.empty_rows > 2: - raise IsNextField() + #if self.empty_rows > 2: + # raise IsNextField() if not self.data: self.data = dict(data)