Created
June 6, 2020 01:07
-
-
Save Sanix-Darker/ed0d5ab0eb7ac41a174e5d97bb7682e1 to your computer and use it in GitHub Desktop.
[PYTHON]PLAGON.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ____ _ _ ____ ___ _ _ | |
# | _ \| | / \ / ___|/ _ \| \ | | | |
# | |_) | | / _ \| | _| | | | \| | | |
# | __/| |___ / ___ \ |_| | |_| | |\ | | |
# |_| |_____/_/ \_\____|\___/|_| \_| | |
# -------------------------------------- | |
from os import listdir as os_listdir, path as os_path | |
# pip install -U scikit-learn | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
def vectorize(Text):
    """
    Convert a collection of raw documents into dense TF-IDF vectors.

    A fresh ``TfidfVectorizer`` with default settings is fitted on ``Text``
    and the resulting matrix is densified with ``.toarray()``.

    :param Text: iterable of document strings to fit and transform.
    :return: dense array holding one TF-IDF row per input document.
    """
    tfidf_model = TfidfVectorizer()
    weighted_matrix = tfidf_model.fit_transform(Text)
    return weighted_matrix.toarray()
def similarity(doc1, doc2):
    """
    Compute the pairwise cosine-similarity matrix of two document vectors.

    :param doc1: vector of the first document.
    :param doc2: vector of the second document.
    :return: the matrix produced by ``cosine_similarity`` for the two
        vectors; callers in this file read entry ``[0][1]`` as the score
        between ``doc1`` and ``doc2``.
    """
    stacked_vectors = [doc1, doc2]
    return cosine_similarity(stacked_vectors)
def loop_comparaison(file_x, text_vector_x, new_vectors, results):
    """
    Score one document against every document in ``new_vectors``.

    For each ``(file_y, vector_y)`` entry, a ``(name_a, name_b, score)``
    tuple is added to ``results``. The two names are sorted so the same pair
    always appears in the same order regardless of comparison direction.

    :param file_x: name of the reference document.
    :param text_vector_x: vector of the reference document.
    :param new_vectors: iterable of ``(file_name, vector)`` pairs to compare
        against.
    :param results: set that receives the score tuples (mutated in place).
    :return: the same ``results`` set, after the additions.
    """
    for other_file, other_vector in new_vectors:
        # Entry [0][1] of the similarity matrix is the x-vs-y score.
        pair_score = similarity(text_vector_x, other_vector)[0][1]
        name_a, name_b = sorted((file_x, other_file))
        results.add((name_a, name_b, pair_score))
    return results
def plagon_core(list_of_files, contents_of_files):
    """
    Compute a similarity score for every unordered pair of documents.

    The documents are vectorized together with ``vectorize`` and each
    document is then compared with the ones that follow it, so every pair is
    scored exactly once.

    :param list_of_files: names (or paths) of the documents.
    :param contents_of_files: text of each document, in the same order as
        ``list_of_files``.
    :return: set of ``(name_a, name_b, score)`` tuples as produced by
        ``loop_comparaison``; empty when fewer than two documents are given.
    :raises ValueError: if the two inputs do not have the same length.
    """
    file_names = list(list_of_files)
    documents = list(contents_of_files)
    if len(file_names) != len(documents):
        # zip() would silently drop the surplus entries and hide the mismatch.
        raise ValueError(
            "list_of_files and contents_of_files must have the same length "
            "(got %d and %d)" % (len(file_names), len(documents))
        )

    results = set()
    if len(file_names) < 2:
        # Nothing to compare; also avoids fitting the vectorizer on no input.
        return results

    vectors = vectorize(documents)
    s_vectors = list(zip(file_names, vectors))
    for position, (file_x, text_vector_x) in enumerate(s_vectors):
        # Slice by position instead of list.index() on (name, array) tuples:
        # tuple equality falls through to comparing arrays when two names
        # are equal, which raises an "ambiguous truth value" error.
        # Comparing only with later entries scores each pair once.
        remaining = s_vectors[position + 1:]
        results = loop_comparaison(file_x, text_vector_x, remaining, results)
    return results
if __name__ == "__main__":
    import sys

    # Directory holding the .txt documents to compare. An optional first
    # command-line argument overrides the original hard-coded location.
    if len(sys.argv) > 1:
        dir_ = sys.argv[1]
    else:
        dir_ = "/home/d4rk3r/ACTUALC/vagrant/PYTHON/github/test_plagiat"

    list_of_files = [os_path.join(dir_, doc) for doc in os_listdir(dir_) if doc.endswith('.txt')]

    # Read each file inside a context manager so its handle is closed
    # promptly instead of being left to the garbage collector.
    contents_of_files = []
    for file_path in list_of_files:
        with open(file_path) as handle:
            contents_of_files.append(handle.read())

    # Each printed item is one (name_a, name_b, score) tuple.
    for data in plagon_core(list_of_files, contents_of_files):
        print(data)

    # Sample results recorded with this script:
    # ['tt2.txt', 'tt3.txt'] => 0.050632398572142946 (5%)
    # ['tt.txt', 'tt3.txt'] => 0.06448929199938869 (6%)
    # ['tt.txt', 'tt2.txt'] => 0.7492151128400741 (74%)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment