Last active
September 8, 2017 11:55
-
-
Save ardamavi/1154a2864f6f84e03e4ba66a4b8b0b78 to your computer and use it in GitHub Desktop.
Sentence to Tensor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Arda Mavi
import string

import numpy as np
# Alphabet used for one-hot encoding: every character in string.printable.
characters = string.printable

# Maps a position in the alphabet to its character (index -> character),
# which is the direction decode() needs.
token_index = dict(enumerate(characters))

# Size of the word axis (axis 0) of an encoded sentence.
max_word = 140

# Size of the character-position axis (axis 1) of an encoded sentence.
max_length = 80

# Size of the one-hot axis (axis 2). Derived from the alphabet so the two
# cannot drift apart; len(string.printable) is 100.
char_len = len(characters)

# All-zero template with the shape of an encoded sentence.
sentence_array = np.zeros((max_word, max_length, char_len))
def encode(sentence, alphabet=None, max_words=None, max_chars=None):
    """One-hot encode a sentence, character by character.

    The sentence is split on single spaces. For word ``i`` and character
    ``j`` of that word, ``result[i, j, k]`` is set to ``1.0`` where ``k`` is
    the character's position in the alphabet; every other entry stays zero.
    Consecutive spaces produce empty words, which are left as all-zero rows.

    Args:
        sentence: Text to encode.
        alphabet: Sequence of supported characters. Defaults to the
            module-level ``characters``.
        max_words: Size of the word axis. Defaults to the module-level
            ``max_word``.
        max_chars: Size of the character-position axis. Defaults to the
            module-level ``max_length``.

    Returns:
        A freshly allocated float array of shape
        ``(max_words, max_chars, depth)``, where ``depth`` is the
        module-level ``char_len`` when ``alphabet`` is omitted and
        ``len(alphabet)`` otherwise.

    Raises:
        ValueError: If a character is not part of the alphabet.
        IndexError: If a character falls outside the word axis or the
            character-position axis.
    """
    # Fall back to the module-level configuration for anything not supplied.
    if alphabet is None:
        alphabet = characters
        depth = char_len
    else:
        depth = len(alphabet)
    if max_words is None:
        max_words = max_word
    if max_chars is None:
        max_chars = max_length

    # Character -> position table built once per call, replacing a linear
    # scan per character. setdefault keeps the first occurrence, matching
    # what str.index would return if the alphabet contained duplicates.
    position_of = {}
    for position, symbol in enumerate(alphabet):
        position_of.setdefault(symbol, position)

    # Allocate a fresh array instead of copying a shared module-level
    # template, so the result cannot be affected by mutations of that template.
    encoded = np.zeros((max_words, max_chars, depth))

    for word_no, word in enumerate(sentence.split(' ')):
        for char_no, char in enumerate(word):
            hot = position_of.get(char)
            if hot is None:
                raise ValueError(
                    'character {!r} is not in the alphabet'.format(char))
            # Checked per character (not per word) so that empty words past
            # the end of the word axis remain harmless.
            if word_no >= max_words or char_no >= max_chars:
                raise IndexError(
                    'character {} of word {} does not fit into a '
                    '({}, {}) sentence array'.format(
                        char_no, word_no, max_words, max_chars))
            encoded[word_no, char_no, hot] = 1.0
    return encoded
def decode(array, alphabet=None):
    """Turn a one-hot sentence array back into text.

    Each entry along axis 0 is read as one word and each entry along axis 1
    as one character position; the first non-zero index along axis 2 selects
    the character. A word ends at its first all-zero character position.

    All-zero words at the end of the array are treated as padding and
    dropped. All-zero words *between* non-empty words are kept as empty
    words, so runs of spaces (and leading spaces) survive a round trip
    through ``encode``. Trailing spaces cannot be recovered because they are
    indistinguishable from padding.

    Args:
        array: Array-like indexed as ``[word, char_position, alphabet_index]``.
        alphabet: Sequence of characters used for encoding. Defaults to the
            module-level ``token_index`` mapping.

    Returns:
        The decoded sentence, words joined by single spaces.

    Raises:
        ValueError: If a non-zero index has no character in the alphabet.
    """
    if alphabet is None:
        index_to_char = token_index
    else:
        index_to_char = dict(enumerate(alphabet))

    words = []
    for word in np.asarray(array):
        letters = []
        for one_hot in word:
            hot = one_hot.nonzero()[0]
            if hot.size == 0:
                # First empty position marks the end of this word.
                break
            char = index_to_char.get(int(hot[0]))
            if char is None:
                raise ValueError(
                    'index {} has no character in the alphabet'.format(
                        int(hot[0])))
            letters.append(char)
        words.append(''.join(letters))

    # Strip the padding: empty words after the last real word.
    while words and not words[-1]:
        words.pop()
    return ' '.join(words)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment