some_simple_algorithm
# forked from https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-069
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking Run or pressing Shift+Enter) lists the files in the input directory.
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

max_features = 20000
maxlen = 100

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
train = train.sample(frac=1)

list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)


def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


model = get_model()
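# Optional: print the layer stack and parameter counts to verify the
# Embedding -> Bidirectional LSTM -> max-pool -> dense architecture before training.
model.summary()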
batch_size = 32
epochs = 2

file_path = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early]

model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)

model.load_weights(file_path)
y_test = model.predict(X_te)

sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv("baseline.csv", index=False)
""" | |
本文主要使用一维卷积核的CNN进行文本分类(二维卷积主要用于图像处理) | |
步骤 1:测试文本的预处理,分词->去去除停用词->统计选择top 5000的词做为特征词 | |
步骤 2:为每个特征词生成ID | |
步骤 3:将文本转化成ID序列,并将左侧补齐 | |
步骤 4:训练集shuffle | |
步骤 5:Embedding Layer 将词转化为词向量 | |
步骤 6:添加Conv1D卷积层 | |
步骤 7:添加池化层 | |
步骤 7:添加全连接层,loss function = binary_crossentropy | |
步骤 8:输出层使用Sigmoid | |
from:https://blog.csdn.net/xiewenbo/article/details/77874080 | |
""" | |
from __future__ import print_function
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras.models import model_from_json
import numpy as np

# set parameters:
max_features = 5001  # top 5000 feature words plus index 0, which is used for padding
maxlen = 100
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 10
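# x_train.txt / y_train.txt are assumed to already contain the ID sequences and
# labels produced by steps 1-3 of the docstring above. A minimal sketch of that
# preprocessing, assuming hypothetical lists `raw_texts` (segmented, stop words
# removed) and `labels`, might look like the commented-out lines below (kept as
# comments because the prebuilt files are loaded next):
#
# from keras.preprocessing.text import Tokenizer
# tokenizer = Tokenizer(num_words=max_features)   # keep only the top feature words
# tokenizer.fit_on_texts(raw_texts)               # assign an ID to each feature word
# id_sequences = tokenizer.texts_to_sequences(raw_texts)
# np.savetxt("x_train.txt", sequence.pad_sequences(id_sequences, maxlen=maxlen), fmt="%d")
# np.savetxt("y_train.txt", np.asarray(labels), fmt="%d")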
x_train = np.loadtxt("x_train.txt", dtype=int)
y_train = np.loadtxt("y_train.txt", dtype=int)

indices = np.arange(x_train.shape[0])
np.random.shuffle(indices)
x_train = x_train[indices]
y_train = y_train[indices]

print('Loading data...')
#x_train=np.loadtxt("x_train.txt",dtype=int)
#y_train=np.loadtxt("y_train.txt",dtype=int)
x_test = x_train[20000:]
y_test = y_train[20000:]
x_train = x_train[:20000]
y_train = y_train[:20000]
#x_test=x_train
#y_test=y_train
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print(x_train[:1])

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.5))

# we add a Conv1D, which will learn `filters` word-group filters of size `kernel_size`:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.5))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

# serialize the architecture to JSON and the weights to HDF5
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model.h5")

# load the model back from disk
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
score = loaded_model.evaluate(x_test, y_test, verbose=0)
print(score)
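# For illustration only: score a single held-out example with the reloaded model.
# predict() returns the sigmoid probability of the positive class for each row.
print(loaded_model.predict(x_test[:1]))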
# coding: utf-8

# In[1]:

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data


# In[2]:

# load the MNIST data set
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

# each input image is 28*28
n_inputs = 28      # one row of 28 pixels is fed per time step
max_time = 28      # 28 rows in total, i.e. 28 time steps
lstm_size = 500    # number of hidden units
n_classes = 10     # 10 classes
batch_size = 50    # 50 samples per batch
n_batch = mnist.train.num_examples // batch_size  # number of batches per epoch
print(n_batch)

# None in the first dimension means the batch can be of any length
x = tf.placeholder(tf.float32, [None, 784])
# the correct labels
y = tf.placeholder(tf.float32, [None, 10])

# initialize the weights
weights = tf.Variable(tf.truncated_normal([lstm_size, n_classes], stddev=0.1))
# initialize the biases
biases = tf.Variable(tf.constant(0.1, shape=[n_classes]))


# define the RNN network
def RNN(X, weights, biases):
    # inputs: [batch_size, max_time, n_inputs]
    inputs = tf.reshape(X, [-1, max_time, n_inputs])
    # the basic LSTM cell
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    # With time_major=False (the default), outputs has shape
    # [batch_size=50, max_time=28, cell.output_size=500], i.e. the 500
    # hidden-unit outputs at every one of the 28 time steps.
    # final_state is an LSTMStateTuple:
    #   final_state[0] is the cell state,
    #   final_state[1] is the hidden state, i.e. the 500 hidden-unit outputs
    #   at the last time step.
    outputs, final_state = tf.nn.dynamic_rnn(lstm_cell, inputs, dtype=tf.float32)
    print(outputs[2])  # debug: symbolic output tensor of the third example in the batch
    # return unnormalized logits; the softmax is applied inside the loss below
    results = tf.matmul(final_state[1], weights) + biases
    return results
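# Note: for this cell, final_state[1] equals outputs[:, -1, :], so the logits
# could equivalently be computed from the last time step of `outputs`:
#   results = tf.matmul(outputs[:, -1, :], weights) + biases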
# compute the RNN logits
prediction = RNN(x, weights, biases)
# loss function: softmax cross-entropy on the logits
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
# optimize with AdamOptimizer
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
# store the per-example results in a boolean list
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(prediction, 1))  # argmax returns the index of the largest value in a 1-D tensor
# compute the accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))  # cast correct_prediction to float32
# initialization
init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(6):
        for batch in range(n_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            sess.run(train_step, feed_dict={x: batch_xs, y: batch_ys})
        acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels})
        print("Iter " + str(epoch) + ", Testing Accuracy= " + str(acc))
    saver.save(sess, 'net/my_net.ckpt')


# In[ ]:
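# A minimal sketch of restoring the saved checkpoint later, assuming the same
# graph has been rebuilt in the current process (variable names must match):
#
# with tf.Session() as sess:
#     saver.restore(sess, 'net/my_net.ckpt')
#     print(sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels}))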