This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Creating feature columns from our categorical data.
# Every column below is keyed by its input feature name and hashed into
# the given number of buckets.
education1 = tf.feature_column.categorical_column_with_hash_bucket(
    key="education", hash_bucket_size=16
)
workclass1 = tf.feature_column.categorical_column_with_hash_bucket(
    key="workclass", hash_bucket_size=10
)
martial1 = tf.feature_column.categorical_column_with_hash_bucket(
    key="marital_status", hash_bucket_size=7
)
occupation1 = tf.feature_column.categorical_column_with_hash_bucket(
    key="occupation", hash_bucket_size=14
)
relationship1 = tf.feature_column.categorical_column_with_hash_bucket(
    key="relationship", hash_bucket_size=6
)
race1 = tf.feature_column.categorical_column_with_hash_bucket(
    key="race", hash_bucket_size=5
)
gender1 = tf.feature_column.categorical_column_with_hash_bucket(
    key="gender", hash_bucket_size=2
)
native_country1 = tf.feature_column.categorical_column_with_hash_bucket(
    key="native_country", hash_bucket_size=60
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Doing a first cleaning of the texts
def clean_text(text):
    """Lower-case *text* and expand a handful of English contractions.

    Args:
        text: The raw string to normalise.

    Returns:
        The lower-cased string with every contraction listed in
        ``replacements`` expanded.
    """
    # (pattern, expansion) pairs, applied in this order to the lower-cased
    # text. "'ll" is last so it only sees what the earlier rules left behind.
    replacements = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
    )
    text = text.lower()
    for pattern, expansion in replacements:
        text = re.sub(pattern, expansion, text)
    # Hand the cleaned string back; without this the caller receives None.
    return text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Creating a list of all of the conversations.
# The last ' +++$+++ '-separated field of each record is taken, its first
# and last characters are dropped, quotes and spaces are removed, and the
# remainder is split on commas. The final entry of `conversations` is
# excluded by the slice.
conversations_ids = []
for conversation in conversations[:-1]:
    id_field = conversation.rsplit(' +++$+++ ', 1)[-1]
    _conversation = id_field[1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(_conversation.split(','))
# Getting separately the questions and the answers | |
questions = [] | |
answers = [] | |
for conversation in conversations_ids: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sorting clean questions and answers by question length.
# Only questions whose length is between 1 and max_question_length
# (inclusive) are kept; each answer stays paired with its question by index.
max_question_length = 24

# Select the surviving indices once instead of rescanning the whole data set
# for every candidate length.
kept_indices = [
    index
    for index, question in enumerate(questions_int_sequence)
    if 1 <= len(question) <= max_question_length
]
# list.sort is stable, so questions of equal length keep their original
# relative order -- the same order the length-by-length scan produced.
kept_indices.sort(key=lambda index: len(questions_int_sequence[index]))

sorted_clean_questions = [questions_int_sequence[index] for index in kept_indices]
sorted_clean_answers = [answers_int_sequence[index] for index in kept_indices]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Converting questions into sequences of integers.
# Each question is split on whitespace and every word is replaced by its
# entry in dict_word2integer.
#
# Words missing from the vocabulary map to the integer id of "<OUT>" when the
# vocabulary has one. If "<OUT>" has not been registered in dict_word2integer,
# the literal "<OUT>" marker is used instead (the previous behaviour), so this
# block does not depend on how the special tokens were added.
out_token = dict_word2integer.get("<OUT>", "<OUT>")

# A comprehension avoids a module-level variable named `int`, which would
# shadow the builtin for the rest of the script.
questions_int_sequence = [
    [dict_word2integer.get(word, out_token) for word in question.split()]
    for question in Clean_questions
]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adding EOS at the end of every answer.
# A comprehension keeps the per-item name local, so the module-level name
# `answers` is not rebound to the last answer string as a side effect.
new_clean_answers = [answer + " <EOS>" for answer in Clean_answers]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Inverse mapped dictionary: integer id -> word, built by swapping the
# key/value pairs of dict_word2integer.
dict_integer2word = {
    index: word for word, index in dict_word2integer.items()
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Inverse mapped dictionary: looks up the word for a given integer id by
# reversing every (word, id) pair of dict_word2integer.
dict_integer2word = {
    number: token for token, number in dict_word2integer.items()
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setting a threshold and mapping each word to a unique integer.
# Only words whose count in word2count is strictly greater than `threshold`
# receive an id; ids are handed out consecutively starting at 0, in the
# iteration order of word2count.
threshold = 20
word_number = 0
dict_word2integer = {}
for word, frequency in word2count.items():
    # Compare against the named threshold rather than a repeated literal so
    # changing `threshold` actually changes the filter.
    if frequency > threshold:
        dict_word2integer[word] = word_number
        word_number += 1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adding the special tokens to our dictionary.
# Each token is assigned the dictionary's size at insertion time plus one.
# NOTE(review): `dict_integer` is not defined in the visible part of this
# file -- confirm whether `dict_word2integer` was intended.
Tokens = ["<PAD>", "<SOS>", "<EOS>", "<OUT>"]
for special_token in Tokens:
    next_id = len(dict_integer) + 1
    dict_integer[special_token] = next_id
NewerOlder