@hhbyyh
Created January 19, 2018 20:08
{
"metadata": {
"kernelspec": {
"language": "python",
"display_name": "Python 2 with Spark 2.1",
"name": "python2-spark21"
},
"language_info": {
"version": "2.7.11",
"file_extension": ".py",
"mimetype": "text/x-python",
"nbconvert_exporter": "python",
"name": "python",
"pygments_lexer": "ipython2",
"codemirror_mode": {
"version": 2,
"name": "ipython"
}
}
},
"cells": [
{
"execution_count": 1,
"cell_type": "code",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "--2018-01-19 13:23:16-- https://repo1.maven.org/maven2/com/intel/analytics/bigdl/bigdl-SPARK_2.1/0.3.0/bigdl-SPARK_2.1-0.3.0-jar-with-dependencies.jar\nResolving repo1.maven.org (repo1.maven.org)... 151.101.48.209\nConnecting to repo1.maven.org (repo1.maven.org)|151.101.48.209|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 77227587 (74M) [application/java-archive]\nSaving to: \u2018bigdl-SPARK_2.1-0.3.0-jar-with-dependencies.jar.4\u2019\n\n100%[======================================>] 77,227,587 66.3MB/s in 1.1s \n\n2018-01-19 13:23:18 (66.3 MB/s) - \u2018bigdl-SPARK_2.1-0.3.0-jar-with-dependencies.jar.4\u2019 saved [77227587/77227587]\n\n"
}
],
"source": "!(export sv=2.1 bv=0.3.0 ; cd ~/data/libs/ && wget https://repo1.maven.org/maven2/com/intel/analytics/bigdl/bigdl-SPARK_${sv}/${bv}/bigdl-SPARK_${sv}-${bv}-jar-with-dependencies.jar)"
},
{
"execution_count": 2,
"cell_type": "code",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Requirement already satisfied: bigdl==0.3.0 in /gpfs/global_fs01/sym_shared/YPProdSpark/user/s95e-7e9e9b7c0b9bb4-b210673fb348/.local/lib/python2.7/site-packages\nRequirement already satisfied: pyspark>=2.2 in /gpfs/global_fs01/sym_shared/YPProdSpark/user/s95e-7e9e9b7c0b9bb4-b210673fb348/.local/lib/python2.7/site-packages (from bigdl==0.3.0)\nRequirement already satisfied: numpy>=1.7 in /usr/local/src/bluemix_jupyter_bundle.v77/notebook/lib/python2.7/site-packages (from bigdl==0.3.0)\nRequirement already satisfied: py4j==0.10.4 in /gpfs/global_fs01/sym_shared/YPProdSpark/user/s95e-7e9e9b7c0b9bb4-b210673fb348/.local/lib/python2.7/site-packages (from pyspark>=2.2->bigdl==0.3.0)\n"
}
],
"source": "!pip install bigdl==0.3.0 | cat"
},
{
"execution_count": 1,
"cell_type": "code",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Prepending /gpfs/fs01/user/s95e-7e9e9b7c0b9bb4-b210673fb348/.local/lib/python2.7/site-packages/bigdl/share/conf/spark-bigdl.conf to sys.path\n"
},
{
"output_type": "stream",
"name": "stderr",
"text": "/gpfs/fs01/user/s95e-7e9e9b7c0b9bb4-b210673fb348/.local/lib/python2.7/site-packages/bigdl/util/engine.py:39: UserWarning: Find both SPARK_HOME and pyspark. You may need to check whether they match with each other. SPARK_HOME environment variable is set to: /usr/local/src/spark21master/spark-2.1.2-bin-2.7.3, and pyspark is found in: /gpfs/fs01/user/s95e-7e9e9b7c0b9bb4-b210673fb348/.local/lib/python2.7/site-packages/pyspark/__init__.pyc. If they are unmatched, please use one source only to avoid conflict. For example, you can unset SPARK_HOME and use pyspark only.\n warnings.warn(warning_msg)\n"
}
],
"source": "from bigdl.nn.layer import *\nfrom bigdl.nn.criterion import *\nfrom bigdl.util.common import *\nfrom pyspark import SparkContext\nimport numpy as np"
},
{
"execution_count": 2,
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": "sc.stop()\nconfCore=create_spark_conf()\nconfCore.set(\"spark.executor.cores\", 1)\nconfCore.set(\"spark.cores.max\", 1)\nsc = SparkContext(appName=\"Mnist\", conf=confCore)\ninit_engine()"
},
{
"execution_count": 3,
"cell_type": "code",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "creating: createLinear\n{u'Linearaa5ab3b4': {u'gradWeight': array([[ 0., 0.]], dtype=float32), u'bias': array([ 0.53909093], dtype=float32), u'weight': array([[ 0.22592682, -0.69651681]], dtype=float32), u'gradBias': array([ 0.], dtype=float32)}}\n"
}
],
"source": "linear = Linear(2, 1)\nprint (linear.parameters())"
},
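{
"execution_count": null,
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": "# Not part of the original gist: a minimal sketch of a single forward pass through\n# the Linear layer created above, assuming Layer.forward accepts a numpy ndarray as\n# in the BigDL 0.3 Python API.\nimport numpy as np\nsample_input = np.array([1.0, 2.0])  # toy 2-dimensional input\nprint(linear.forward(sample_input))  # affine output: weight * input + bias"
},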
{
"execution_count": 4,
"cell_type": "code",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "('Extracting', 'train-images-idx3-ubyte.gz')\n('Extracting', 'train-labels-idx1-ubyte.gz')\n('Extracting', 't10k-images-idx3-ubyte.gz')\n('Extracting', 't10k-labels-idx1-ubyte.gz')\ncreating: createSequential\ncreating: createReshape\ncreating: createSpatialConvolution\ncreating: createTanh\ncreating: createSpatialMaxPooling\ncreating: createTanh\ncreating: createSpatialConvolution\ncreating: createSpatialMaxPooling\ncreating: createReshape\ncreating: createLinear\ncreating: createTanh\ncreating: createLinear\ncreating: createLogSoftMax\ncreating: createClassNLLCriterion\ncreating: createDefault\ncreating: createSGD\ncreating: createMaxEpoch\ncreating: createOptimizer\ncreating: createEveryEpoch\ncreating: createTop1Accuracy\ntraining finished\n"
}
],
"source": "from optparse import OptionParser\nfrom bigdl.dataset import mnist\nfrom bigdl.dataset.transformer import *\nfrom bigdl.nn.layer import *\nfrom bigdl.nn.criterion import *\nfrom bigdl.optim.optimizer import *\nfrom bigdl.util.common import *\n\n\ndef build_model(class_num):\n model = Sequential()\n model.add(Reshape([1, 28, 28]))\n model.add(SpatialConvolution(1, 6, 5, 5))\n model.add(Tanh())\n model.add(SpatialMaxPooling(2, 2, 2, 2))\n model.add(Tanh())\n model.add(SpatialConvolution(6, 12, 5, 5))\n model.add(SpatialMaxPooling(2, 2, 2, 2))\n model.add(Reshape([12 * 4 * 4]))\n model.add(Linear(12 * 4 * 4, 100))\n model.add(Tanh())\n model.add(Linear(100, class_num))\n model.add(LogSoftMax())\n return model\n\n\ndef get_mnist(sc, data_type=\"train\", location=\"/tmp/mnist\"):\n \"\"\"\n Get and normalize the mnist data. We would download it automatically\n if the data doesn't present at the specific location.\n :param sc: SparkContext\n :param data_type: training data or testing data\n :param location: Location storing the mnist\n :return: A RDD of (features: Ndarray, label: Ndarray)\n \"\"\"\n (images, labels) = mnist.read_data_sets(location, data_type)\n images = sc.parallelize(images)\n labels = sc.parallelize(labels + 1) # Target start from 1 in BigDL\n record = images.zip(labels)\n return record\n\ndef get_end_trigger():\n return MaxEpoch(10)\n\ntrain_data = get_mnist(sc, \"train\", \"\")\\\n .map(lambda rec_tuple: (normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),\n rec_tuple[1]))\\\n .map(lambda t: Sample.from_ndarray(t[0], t[1]))\ntest_data = get_mnist(sc, \"test\", \"\")\\\n .map(lambda rec_tuple: (normalizer(rec_tuple[0], mnist.TEST_MEAN, mnist.TEST_STD),\n rec_tuple[1]))\\\n .map(lambda t: Sample.from_ndarray(t[0], t[1]))\noptimizer = Optimizer(\n model=build_model(10),\n training_rdd=train_data,\n criterion=ClassNLLCriterion(),\n optim_method=SGD(learningrate=0.01, learningrate_decay=0.0002),\n end_trigger=get_end_trigger(),\n batch_size=128)\noptimizer.set_validation(\n batch_size=128,\n val_rdd=test_data,\n trigger=EveryEpoch(),\n val_method=[Top1Accuracy()]\n)\ntrained_model = optimizer.optimize()\nparameters = trained_model.parameters()\nprint(\"training finished\")"
},
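{
"execution_count": null,
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": "# Not part of the original gist: a minimal sketch that runs the trained LeNet on a\n# few test samples. Assumes Layer.predict(data_rdd) from the BigDL 0.3 Python API\n# returns an RDD of per-sample output ndarrays; argmax + 1 maps the log-softmax\n# output back to BigDL's 1-based labels.\npredictions = trained_model.predict(test_data)\nfor p in predictions.take(3):\n    print('predicted label: %d' % (np.argmax(p) + 1))"
},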
{
"execution_count": 6,
"cell_type": "code",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "creating: createTop1Accuracy\nEvaluated result: 0.951300024986, total_num: 10000, method: Top1Accuracy\n"
}
],
"source": "\nresults = trained_model.evaluate(test_data, 128, [Top1Accuracy()])\nfor result in results:\n print(result)"
},
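{
"execution_count": null,
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": "# Not part of the original gist: a minimal sketch for persisting and reloading the\n# trained model. Assumes the Layer.save / Model.load pair from the BigDL 0.3 Python\n# API; the path below is only an example.\ntrained_model.save('/tmp/mnist_lenet.bigdl', True)  # True overwrites an existing file\nreloaded_model = Model.load('/tmp/mnist_lenet.bigdl')"
},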
{
"execution_count": null,
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": ""
}
],
"nbformat": 4,
"nbformat_minor": 1
}