diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index 7660912..2499b1f 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -11,7 +11,7 @@ import pdb class PDFRecordFinder(object): def __init__(self, src, heading_exp=None): if not heading_exp: - heading_exp = re.compile('\s+Record Name: (.*)') + heading_exp = re.compile('(\s+Record Name: (.*))|Record\ Layout') field_heading_exp = re.compile('^Field.*Field.*Length.*Description') @@ -35,28 +35,29 @@ class PDFRecordFinder(object): def locate_heading_rows_by_field(self): results = [] record_break = [] + line_is_whitespace_exp = re.compile('^(\s*)$') + record_begin_exp = self.heading_exp #re.compile('Record\ Name') + for (i, row) in enumerate(self.textrows): match = self.field_heading_exp.match(row) if match: # work backwards until we think the header is fully copied space_count_exp = re.compile('^(\s*)') position = i - 1 - last_spaces = 10000 + spaces = 0 + #last_spaces = 10000 complete = False header = None while not complete: - if len(self.textrows[position].strip()) == 0: - spaces = 10000 - else: - spaces = space_count_exp.search(self.textrows[position]).end() - - if spaces > last_spaces: - header = self.textrows[position + 1:i] + line_is_whitespace = True if line_is_whitespace_exp.match(self.textrows[position]) else False + is_record_begin = record_begin_exp.search(self.textrows[position]) + if is_record_begin or line_is_whitespace: + header = self.textrows[position-1:i] complete = True - last_spaces = spaces position -= 1 name = ''.join(header).strip().decode('ascii','ignore') + print (name, position) results.append((i, name, position)) else: # See if this row forces us to break from field reading. @@ -81,6 +82,7 @@ class PDFRecordFinder(object): merged.append( (a[0], end_pos-1, a[1]) ) return merged + """ def locate_heading_rows(self): results = [] for (i, row) in enumerate(self.textrows): @@ -95,16 +97,15 @@ class PDFRecordFinder(object): return merged def locate_layout_block_rows(self): - """ - Search for rows that contain "Record Layout", as these are not fields - we are interested in because they contain the crazy blocks of field definitions - and not the nice 4-column ones that we're looking for.""" + # Search for rows that contain "Record Layout", as these are not fields + # we are interested in because they contain the crazy blocks of field definitions + # and not the nice 4-column ones that we're looking for. results = [] for (i, row) in enumerate(self.textrows): match = re.match("Record Layout", row) - + """ def find_fields(self, row_iter): cc = ColumnCollector()