How to run multiple jobs in one SparkContext from separate threads in PySpark?
# Source: http://stackoverflow.com/questions/30214474/how-to-run-multiple-jobs-in-one-sparkcontext-from-separate-threads-in-pyspark
# Prereqs: set
#   spark.dynamicAllocation.enabled true
#   spark.shuffle.service.enabled   true
# in spark-defaults.conf

import threading

from pyspark import SparkContext, SparkConf


def task(sc, i):
    # Each thread submits its own Spark job through the shared SparkContext.
    print(sc.parallelize(range(i * 10000)).count())


def run_multiple_jobs():
    conf = SparkConf().setMaster('local[*]').setAppName('appname')
    # Set scheduler to FAIR: http://spark.apache.org/docs/latest/job-scheduling.html#scheduling-within-an-application
    conf.set('spark.scheduler.mode', 'FAIR')
    sc = SparkContext(conf=conf)
    for i in range(4):
        t = threading.Thread(target=task, args=(sc, i))
        t.start()
        print('spark task', i, 'has started')


run_multiple_jobs()

# OUTPUT:
# spark task 0 has started
# spark task 1 has started
# spark task 2 has started
# spark task 3 has started
# 30000
# 0
# 10000
# 20000
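
With FAIR scheduling enabled, each thread can also be assigned to a named scheduler pool so that its jobs get the pool's share of resources. Below is a minimal sketch of that variant; the pool name 'fair_pool' is purely illustrative (pools are normally declared in fairscheduler.xml, see the job-scheduling docs linked above), and note that in some PySpark versions local properties are only reliably per-Python-thread when pinned thread mode is enabled.

def task_in_pool(sc, i):
    # setLocalProperty applies to jobs submitted from the calling thread,
    # so each thread can route its jobs into its own FAIR pool.
    # 'fair_pool' is a hypothetical pool name used here for illustration.
    sc.setLocalProperty('spark.scheduler.pool', 'fair_pool')
    print(sc.parallelize(range(i * 10000)).count())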