from __future__ import unicode_literals
import re
def find_start(text, heading, index):
    """Locate where a section such as an appendix or supplement begins.

    Searches ``text`` for a line that starts with ``heading``, a single
    space, then ``index``. Both values are interpolated into the regular
    expression unescaped, so they are interpreted as regex fragments.

    Returns the character offset of the first matching line start, or
    ``None`` when no line matches.
    """
    pattern = r'^{0} {1}'.format(heading, index)
    # MULTILINE lets ``^`` anchor at the beginning of every line, not just
    # at the beginning of the whole text.
    found = re.search(pattern, text, re.MULTILINE)
    return found.start() if found else None
def find_offsets(text, search_fn):
    """Find the start and end of an appendix, supplement, etc.

    ``search_fn`` is called with a piece of text and is expected to return
    the offset of the first section start in it, or ``None`` / ``-1`` when
    there is none.

    Returns ``None`` if ``text`` contains no section start. Otherwise
    returns a ``(start, end)`` tuple where ``end`` is the offset of the next
    section start, or ``len(text)`` if no further start is found.
    """
    start = search_fn(text)
    if start is None or start == -1:
        return None

    # Search strictly after the start so the same match is not found again.
    post_start_text = text[start + 1:]
    end = search_fn(post_start_text)
    # Compare against None explicitly: an offset of 0 is a valid match (the
    # next section begins immediately after ``start``) and must not be
    # mistaken for "not found" just because 0 is falsy.
    if end is not None and end > -1:
        # ``end`` is relative to post_start_text; shift it back into the
        # coordinates of ``text``.
        return (start, start + end + 1)
    return (start, len(text))
def segments(text, offsets_fn, exclude=None):
    """Break ``text`` into consecutive sub parts.

    ``offsets_fn`` is called repeatedly as
    ``offsets_fn(remaining_text, segment_number, exclude)`` and should
    return a ``(begin, end)`` pair relative to the text it was given, or a
    falsy value once nothing is left to split off. After each segment the
    text before ``end`` is dropped and every ``exclude`` range is shifted
    left by ``end`` so it stays relative to the remaining text.

    Returns a list of ``(begin, end)`` tuples expressed as offsets into the
    original ``text``.
    """
    exclude = exclude or []
    found = []
    rest = text
    consumed = 0  # how many characters of the original text were dropped
    seg_id = 0

    while True:
        offsets = offsets_fn(rest, seg_id, exclude)
        if not offsets:
            return found

        begin, end = offsets
        # Translate from "rest"-relative to original-text coordinates.
        found.append((consumed + begin, consumed + end))

        seg_id += 1
        consumed += end
        rest = rest[end:]
        exclude = [(pair[0] - end, pair[1] - end) for pair in exclude]