Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.1.0
      /_/

Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.6.0_65)
Type in expressions to have them evaluated.
Type :help for more information.
2015-12-10 11:06:08.048 java[89392:1203] Unable to load realm info from SCDynamicStore
15/12/10 11:06:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark context available as sc.
scala> val babyNames = sc.textFile("baby_names.csv")
babyNames: org.apache.spark.rdd.RDD[String] = baby_names.csv MappedRDD[1] at textFile at <console>:12

scala> val rows = babyNames.map(line => line.split(","))
rows: org.apache.spark.rdd.RDD[Array[String]] = MappedRDD[2] at map at <console>:14
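
Both textFile and map are lazy transformations, so nothing has read the file yet; the REPL is only printing each RDD's lineage. An action forces evaluation. As a sketch (not from the original session, and assuming baby_names.csv is the New York State baby names file with comma-separated columns), the first few parsed rows could be inspected like this:

// Sketch: take is an action, so it actually triggers the file read
rows.take(3).foreach(row => println(row.mkString(" | ")))
// Note: line.split(",") only works here because no field contains an
// embedded comma; general CSV needs a real parser.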
scala> sc.parallelize(List(1,2,3)).flatMap(x=>List(x,x,x)).collect
res0: Array[Int] = Array(1, 1, 1, 2, 2, 2, 3, 3, 3)

scala> sc.parallelize(List(1,2,3)).map(x=>List(x,x,x)).collect
res1: Array[List[Int]] = Array(List(1, 1, 1), List(2, 2, 2), List(3, 3, 3))

scala> sc.parallelize(List(1,2,3)).flatMap(x=>List(x,x,x))
res2: org.apache.spark.rdd.RDD[Int] = FlatMappedRDD[8] at flatMap at <console>:13

scala> sc.parallelize(List(1,2,3)).map(x=>List(x,x,x))
res3: org.apache.spark.rdd.RDD[List[Int]] = MappedRDD[10] at map at <console>:13
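
The pairs of runs above are the whole map-versus-flatMap story: map emits exactly one output element per input, so wrapping each number in a List yields an RDD[List[Int]], while flatMap flattens the returned collections into a single RDD[Int]. The classic use of flatMap is tokenizing lines into words; a minimal sketch, not from the original session:

// flatMap flattens the per-line word arrays into one RDD[String]
val words = sc.parallelize(List("to be", "or not to be")).flatMap(line => line.split(" "))
words.collect  // Array(to, be, or, not, to, be)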
scala> val davidRows = rows.filter(row => row(1).contains("DAVID"))
davidRows: org.apache.spark.rdd.RDD[Array[String]] = FilteredRDD[11] at filter at <console>:16
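
filter keeps only the elements for which the predicate returns true, so davidRows holds every row whose second column contains "DAVID" (treating row(1) as the first-name column is an assumption about this file's layout). Like map, filter is lazy; an action would force it:

// Sketch: count is an action, forcing the filter to actually run
davidRows.count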
scala> val parallel = sc.parallelize(1 to 9, 3)
parallel: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[12] at parallelize at <console>:12

scala> parallel.mapPartitions( x => List(x.next).iterator).collect
res4: Array[Int] = Array(1, 4, 7)
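
Unlike map, which is called once per element, mapPartitions is called once per partition and receives an Iterator over that partition's elements. With 1 to 9 split contiguously across 3 partitions as [1,2,3], [4,5,6], [7,8,9], taking x.next from each iterator yields the three firsts: 1, 4, 7. A per-partition aggregate is the more typical use; a sketch under the same partitioning:

// One output per partition: sum each partition's elements
val sums = sc.parallelize(1 to 9, 3).mapPartitions(it => Iterator(it.sum))
sums.collect  // Array(6, 15, 24) given the split [1,2,3], [4,5,6], [7,8,9]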
scala> val parallel = sc.parallelize(1 to 9)
parallel: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[14] at parallelize at <console>:12

scala> parallel.mapPartitions( x => List(x.next).iterator).collect
res5: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8)

scala> parallel.mapPartitionsWithIndex( (index: Int, it: Iterator[Int]) => it.toList.map(x => index + ", "+x).iterator).collect
res6: Array[String] = Array(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 7, 9)
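
res6 looks garbled but is correct: each element is a String such as "7, 9", and the embedded ", " makes the flattened display read like bare numbers. With no partition count given, this shell defaulted to 8 partitions ([1] through [7], plus [8, 9] in the last), which is also why res5 above returned eight firsts. Emitting tuples instead of strings avoids the ambiguity; a sketch:

// Sketch: pair each value with its partition index for an unambiguous display
parallel.mapPartitionsWithIndex((index: Int, it: Iterator[Int]) => it.map(x => (index, x))).collect
// e.g. Array((0,1), (1,2), (2,3), (3,4), (4,5), (5,6), (6,7), (7,8), (7,9)) with 8 partitions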
scala> val parallel = sc.parallelize(1 to 9, 3)
parallel: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[17] at parallelize at <console>:12

scala> parallel.mapPartitionsWithIndex( (index: Int, it: Iterator[Int]) => it.toList.map(x => index + ", "+x).iterator).collect
res7: Array[String] = Array(0, 1, 0, 2, 0, 3, 1, 4, 1, 5, 1, 6, 2, 7, 2, 8, 2, 9)
scala> val parallel = sc.parallelize(1 to 9)
parallel: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[19] at parallelize at <console>:12

scala> parallel.sample(true,.2).count
res8: Long = 2

scala> parallel.sample(true,.2).count
res9: Long = 2

scala> parallel.sample(true,.2).count
res10: Long = 2

scala> parallel.sample(true,.2).count
res11: Long = 3

scala> parallel.sample(true,.2).count
res12: Long = 2

scala> parallel.sample(true,.2).count
res13: Long = 0

scala> parallel.sample(true,.1)
res14: org.apache.spark.rdd.RDD[Int] = PartitionwiseSampledRDD[26] at sample at <console>:15
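
sample(withReplacement, fraction) is a transformation, and fraction is only the expected proportion of elements kept, not a guarantee, which is why repeated counts over the same nine-element RDD come back as 2, 3, or even 0. The final call returns the (lazy) sampled RDD itself rather than running anything. An optional third argument fixes the random seed so repeated runs draw the same sample; a sketch with an arbitrary seed:

// Sketch: with an explicit seed, the sample (and its count) is reproducible
parallel.sample(true, 0.2, 7L).count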