Last active
February 11, 2021 15:52
-
-
Save Nempickaxe/1dd4a01b692dba82bacac3c897e5ceb4 to your computer and use it in GitHub Desktop.
Split a sentence into chunks that fit within a maximum character width.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_interval(space_list, width):
    """Return the entry just before the first later entry that exceeds ``width``.

    Consecutive pairs ``(current, following)`` of ``space_list`` are scanned
    in order; as soon as ``following > width`` the ``current`` entry is
    returned.  The first entry itself is never compared against ``width``,
    so it is returned even when it is already larger than ``width``.

    Args:
        space_list: Non-empty sequence of positions (sliceable, e.g. a list).
        width: Threshold the following entry is compared against.

    Returns:
        The chosen entry, or the last entry of ``space_list`` when no entry
        after the first exceeds ``width``.

    Raises:
        IndexError: If ``space_list`` is empty.
    """
    for current, following in zip(space_list, space_list[1:]):
        if following > width:
            return current
    # Nothing after the first entry exceeds the width: take the last entry.
    return space_list[-1]
def get_subtracted_list(space_list, width):
    """Subtract ``width`` from every entry, clamping negative results to 0.

    Args:
        space_list: Iterable of numeric positions.
        width: Amount subtracted from each position.

    Returns:
        A new list holding ``int(max(x - width, 0))`` for each ``x``.
    """
    # max() keeps the arithmetic in integers; the (d + abs(d)) / 2 form goes
    # through float division and loses precision for very large integers.
    return [int(max(x - width, 0)) for x in space_list]
def get_all_breaks(space_list, width):
    """Greedily pick break offsets so each step spans at most ``width``.

    Starting from position 0, the furthest candidate lying within ``width``
    of the previous break is chosen as the next break.  When even the
    nearest remaining candidate is further away than ``width`` (a stretch
    with no break opportunity), that nearest candidate is taken anyway, so
    the scan always advances instead of stopping early.

    Args:
        space_list: Candidate break positions measured from the start of the
            text.  Assumed to be in ascending order.  Not modified.
        width: Maximum distance allowed between consecutive breaks.

    Returns:
        A list of non-cumulative offsets: each value is the distance from
        the previous break (or from 0 for the first one).  The list is empty
        when there is no candidate greater than 0.
    """
    split_indices = []
    cursor = 0  # absolute position of the most recent break
    i = 0
    total = len(space_list)
    while i < total:
        if space_list[i] <= cursor:
            # Zero-length step (e.g. a candidate at position 0): skip it.
            i += 1
            continue
        # The nearest remaining candidate is always usable, even if it lies
        # beyond the width; there is simply no earlier place to break.
        chosen = space_list[i]
        i += 1
        # Extend to the furthest candidate that still fits within the width.
        while i < total and space_list[i] - cursor <= width:
            chosen = space_list[i]
            i += 1
        split_indices.append(chosen - cursor)
        cursor = chosen
    return split_indices
def splitting(x, width=400):
    """Split the string ``x`` into stripped chunks, breaking only at spaces.

    The index of every space in ``x``, plus the end of the string, is
    offered to ``get_all_breaks`` as a candidate break position.  ``x`` is
    then cut at the returned offsets and each piece is stripped of
    surrounding whitespace.

    Args:
        x: Text to split.
        width: Width limit forwarded to ``get_all_breaks``.

    Returns:
        A list of non-empty, whitespace-stripped chunks in their original
        order.  The list is empty when ``x`` contains no non-space text.
    """
    # Including len(x) as a candidate lets the trailing word stay attached
    # to the preceding text when it fits, rather than always being cut off
    # at the last space.
    break_points = [index for index, char in enumerate(x) if char == ' ']
    break_points.append(len(x))

    pieces = []
    remainder = x
    # Offsets are relative to the previous cut, so consume the text
    # progressively from the front.
    for offset in get_all_breaks(break_points, width=width):
        pieces.append(remainder[:offset].strip())
        remainder = remainder[offset:]
    pieces.append(remainder.strip())

    # Drop every empty chunk (e.g. those produced by consecutive spaces or
    # by the final cut landing exactly at the end of the text).
    return [piece for piece in pieces if piece]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment