Source code for regparser.search

from __future__ import unicode_literals

import re


def find_start(text, heading, index):
    """Locate where a headed section (appendix, supplement, etc.) begins.

    The pattern ``^<heading> <index>`` is searched for in ``text`` using
    multi-line mode, so the heading has to sit at the beginning of a line.
    ``heading`` and ``index`` are interpolated into the pattern unescaped,
    which means regular-expression syntax in either one is honoured.

    Returns the character offset of the first match, or ``None`` when the
    pattern does not occur.
    """
    pattern = r'^{0} {1}'.format(heading, index)
    found = re.search(pattern, text, re.MULTILINE)
    if found is None:
        return None
    return found.start()
def find_offsets(text, search_fn):
    """Find the start and end of an appendix, supplement, etc.

    ``search_fn`` is called with a string and is expected to return the
    offset at which a section starts, or ``None``/``-1`` when there is no
    such section.

    Returns ``None`` if ``text`` contains no section start. Otherwise
    returns a ``(start, end)`` tuple, where ``end`` is the offset of the
    next section start, or ``len(text)`` if no further section follows.
    """
    start = search_fn(text)
    if start is None or start == -1:
        return None

    # Search again one character past the start so the same section header
    # is not found a second time; the resulting offset is relative to that
    # shifted text, hence the "+ 1" when translating it back.
    end = search_fn(text[start + 1:])

    # An offset of 0 is a legitimate result (the next section begins right
    # after the skipped character), so test against None explicitly rather
    # than relying on truthiness.
    if end is not None and end > -1:
        return (start, start + end + 1)
    return (start, len(text))
def segments(text, offsets_fn, exclude=None):
    """Break ``text`` into consecutive sub parts.

    ``offsets_fn`` is called as ``offsets_fn(remaining, seg_id, exclude)``
    and should return a ``(begin, end)`` pair relative to ``remaining``, or
    a falsy value once nothing more can be found. After each hit the text
    up to ``end`` is dropped, ``seg_id`` is incremented, and every exclusion
    range is shifted left by ``end`` so it stays aligned with the shortened
    text.

    Returns a list of ``(begin, end)`` tuples expressed as offsets into the
    original ``text``.
    """
    excluded = exclude or []
    found = []
    consumed = 0  # number of characters of ``text`` already cut away
    tail = text
    seg_id = 0

    while True:
        offsets = offsets_fn(tail, seg_id, excluded)
        if not offsets:
            break
        begin, end = offsets
        # Translate tail-relative offsets back into offsets within ``text``.
        found.append((consumed + begin, consumed + end))
        consumed += end
        tail = tail[end:]
        excluded = [(span[0] - end, span[1] - end) for span in excluded]
        seg_id += 1

    return found