本文共 2407 字,大约阅读时间需要 8 分钟。
// flatMap: split every movie title on spaces and flatten the pieces into a
// single RDD of words. Assumes the spark-shell's predefined `sc` is in scope.
val favMovies = sc.parallelize(
  List("Pulp Fiction", "Requiem for a dream", "Aclockwork Orange")
)
favMovies.flatMap(_.split(" ")).collect()
// sample: draw a random subset of the numbers 1..20.
// Named arguments make the three positional literals self-explanatory; the
// fixed seed is the one used in the original snippet.
val data = sc.parallelize(
  List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
)
data.sample(withReplacement = true, fraction = 0.1, seed = 12345L).collect()
// distinct: remove duplicate elements ("The Apartment" is listed twice).
val movieList = sc.parallelize(
  List("A Nous Liberte", "Airplane", "The Apartment", "The Apartment")
)
movieList.distinct().collect()
// intersection: keep only the elements present in both RDDs.
// Result: Array[String] = Array(Tom Mahoney)
// NOTE: "PaulJones" (no space) does not equal "Paul Jones" in db_skills, which
// is why only "Tom Mahoney" is reported as common to both lists.
val java_skills = sc.parallelize(
  List("Tom Mahoney", "Alicia Whitekar", "PaulJones", "Rodney Marsh")
)
val db_skills = sc.parallelize(
  List("James Kent", "Paul Jones", "Tom Mahoney", "Adam Waugh")
)
java_skills.intersection(db_skills).collect()
// union: concatenate two RDDs; duplicates are kept, so "1" appears twice.
// Result: Array[String] = Array(1, 2, 3, 1, 5, 6)
val java_skills = sc.parallelize(List("1", "2", "3"))
val db_skills = sc.parallelize(List("1", "5", "6"))
java_skills.union(db_skills).collect()
// subtract: keep the elements of java_skills that do not occur in db_skills.
// Result: Alicia Whitekar, Rodney Marsh
val java_skills = sc.parallelize(
  List("Tom Mahoney", "Alicia Whitekar", "Paul Jones", "Rodney Marsh")
)
val db_skills = sc.parallelize(
  List("James Kent", "Paul Jones", "Tom Mahoney", "Adam Waugh")
)
// The original snippet built both RDDs but never invoked the operation it
// was demonstrating; this call produces the result quoted above.
java_skills.subtract(db_skills).collect()
// cartesian: pair every year with every month (6 x 6 = 36 pairs).
val months = sc.parallelize(List("Jan", "Feb", "Mar", "Apr", "May", "Jun"))
val years = sc.parallelize(List(2010, 2011, 2012, 2013, 2014, 2015))
// `val` instead of `var`: the reference is never reassigned.
val yc = years.cartesian(months)

// 36
yc.count()

// Output recorded in the original post:
// Array[(Int, String)] = Array((2010,Jan), (2010,Feb), (2010,Mar), (2011,Jan), (2011,Feb),
// (2011,Mar), (2012,Jan), (2012,Feb), (2012,Mar), (2010,Apr), (2010,May), (2010,Jun),
// (2011,Apr), (2011,May), (2011,Jun), (2012,Apr), (2012,May), (2012,Jun), (2013,Jan),
// (2013,Feb), (2013,Mar), (2014,Jan), (2014,Feb), (2014,Mar), (2015,Jan), (2015,Feb),
// (2015,Mar), (2013,Apr), (2013,May), (2013,Jun), (2014,Apr), (2014,May), (2014,Jun),
// (2015,Apr), (2015,May), (2015,Jun))
yc.take(36)
collect()、count()、take(n)、first() = take(1)、saveAsTextFile(path)、saveAsSequenceFile(path)、saveAsObjectFile(path)、foreach(func)
// groupByKey: gather all sales figures per city, then sum each group.
val storeSales = sc.parallelize(
  Array(("London", 23.4), ("Manchester", 19.8), ("Leeds", 14.7), ("London", 26.6))
)
storeSales
  .groupByKey()
  .map { case (city, amounts) => (city, amounts.sum) }
  .collect()

// reduceByKey: the same per-city totals, expressed as a pairwise reduction.
storeSales.reduceByKey(_ + _).collect()
// saveAsTextFile: read README.md, break it into words and write them out.
val dataFile = sc.textFile("README.md")

// Split each line into words and flatten the result of each split.
val words = dataFile.flatMap(line => line.split(" "))

// Save the words as text under /tmp/scalawords/.
words.saveAsTextFile("/tmp/scalawords/")
转载地址:http://lpawb.baihongyu.com/