Created
July 12, 2025 07:59
-
-
Save quicksilver0/39c9ec0ab3d58e77bb01f04b36d22426 to your computer and use it in GitHub Desktop.
Finding identical matches in text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import difflib | |
# Find identical substrings (at least 5 characters long by default) shared by two texts. Tune the minimum length for real-world input.
def find_overlaps(text1, text2, min_length=5):
    """Return the runs of ``text1`` that ``difflib`` matches against ``text2``.

    The two inputs are aligned with ``difflib.SequenceMatcher`` and every
    matching block that is at least ``min_length`` items long is returned
    as a slice of ``text1``, in the order the blocks appear in ``text1``.

    Args:
        text1: First sequence (typically a string); the returned overlaps
            are slices of this argument.
        text2: Second sequence to compare against.
        min_length: Minimum size a matching block must have to be
            reported. Defaults to 5.

    Returns:
        A list of the matching slices of ``text1``; empty when no block
        reaches ``min_length``.
    """
    # autojunk=False: the default heuristic drops "popular" items from the
    # match index once text2 has 200+ items, and in ordinary text nearly
    # every character is popular, so long shared passages would be missed.
    matcher = difflib.SequenceMatcher(None, text1, text2, autojunk=False)
    return [
        text1[block.a:block.a + block.size]
        for block in matcher.get_matching_blocks()
        # get_matching_blocks() always ends with a zero-length sentinel;
        # the size > 0 check keeps it out even when min_length <= 0.
        if block.size > 0 and block.size >= min_length
    ]
# Example usage: report every shared run of at least 5 characters.
text1 = "The quick brown fox jumps over the lazy dog"
text2 = "A quick brown fox leaped over the lazy hound"
print(
    find_overlaps(text1, text2, min_length=5)
)
# Printed output:
# [' quick brown fox ', ' over the lazy ']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment