Monday 23 September 2019

spark_hadoop_commands

===================================================================
Hadoop Spark Examples using Scala
===================================================================

val key = sc.parallelize(List(1,2,3,4))

val value = sc.parallelize(List("a","b","c","d"))

val mapRdd = key.zip(value)

val tabData = mapRdd.map(x => x._1 + "\t" + x._2)

val tupleData = mapRdd.map(x => (x._1, x._2))
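
A quick check of what zip produces (spark-shell output shown for illustration only): the two RDDs are paired element by element, so mapRdd holds (Int, String) tuples.

mapRdd.collect
// Array[(Int, String)] = Array((1,a), (2,b), (3,c), (4,d))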


tabData.saveAsTextFile("/home/orienit/spark/output/tabData")

tabData.saveAsTextFile("file:///home/orienit/spark/output/tabData")


tupleData.saveAsSequenceFile("/home/orienit/spark/output/tupleData")

tupleData.saveAsSequenceFile("file:///home/orienit/spark/output/tupleData")




import org.apache.hadoop.conf._

import org.apache.hadoop.io._


import org.apache.hadoop.mapred._


import org.apache.hadoop.mapreduce.lib.input._

import org.apache.hadoop.mapreduce.lib.output._




==============================================
Store data into Hadoop using Hadoop APIs
==============================================


// for text format only


val data_1 = mapRdd.map(x => (x._1, x._2))



(or)

// for any other format (using Writable key/value types)


val data_1 = mapRdd.map(x => (new LongWritable(x._1), new Text(x._2)))


val conf = new Configuration()


data_1.saveAsHadoopFile("file:///home/orienit/spark/output/data_1", classOf[LongWritable], classOf[Text], classOf[org.apache.hadoop.mapred.TextOutputFormat[LongWritable, Text]]);



data_1.saveAsHadoopFile("file:///home/orienit/spark/output/tupleData_1", classOf[LongWritable], classOf[Text], classOf[org.apache.hadoop.mapred.SequenceFileOutputFormat[LongWritable, Text]]);

data_1.saveAsNewAPIHadoopFile("file:///home/orienit/spark/output/data_2", classOf[LongWritable], classOf[Text], classOf[org.apache.hadoop.mapreduce.lib.output.TextOutputFormat[LongWritable, Text]], conf);



data_1.saveAsNewAPIHadoopFile("file:///home/orienit/spark/output/tupleData_2", classOf[LongWritable], classOf[Text], classOf[org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat[LongWritable, Text]], conf);
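
The conf passed to saveAsNewAPIHadoopFile is where Hadoop output options can be set before saving. A minimal sketch, assuming gzip-compressed output is wanted; the _gz output path is hypothetical:

// hypothetical: enable gzip compression on the new-API output (reuses the conf defined above)
conf.set("mapreduce.output.fileoutputformat.compress", "true")
conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.GzipCodec")

data_1.saveAsNewAPIHadoopFile("file:///home/orienit/spark/output/tupleData_2_gz", classOf[LongWritable], classOf[Text], classOf[org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat[LongWritable, Text]], conf);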





==============================================
Load data from Hadoop using Hadoop APIs
==============================================




val data = sc.textFile("/home/orienit/spark/output/tabData").map(x => (x.split("\t")(0), x.split("\t")(1)))
data.collect


Hadoop Old API:
=====================


val hadoop1 = sc.hadoopFile[Text, Text, org.apache.hadoop.mapred.KeyValueTextInputFormat]("/home/orienit/spark/output/tabData").map{ case (x, y) => (x.toString, y.toString) }
hadoop1.collect


val hadoop11 = sc.hadoopFile[LongWritable, Text, org.apache.hadoop.mapred.TextInputFormat]("/home/orienit/spark/output/tabData").map{ case (x, y) => (y.toString.split("\t")(0), y.toString.split("\t")(1)) }
hadoop11.collect


val hadoop1_seq = sc.hadoopFile[IntWritable, Text, org.apache.hadoop.mapred.SequenceFileInputFormat[IntWritable, Text]]("/home/orienit/spark/output/tupleData").map{ case (x, y) => (x.toString, y.toString) }

(or)

val hadoop1_seq = sc.hadoopFile("/home/orienit/spark/output/tupleData",classOf[org.apache.hadoop.mapred.SequenceFileInputFormat[IntWritable,Text]],classOf[IntWritable], classOf[Text]).map{ case (x, y) => (x.toString, y.toString) }

hadoop1_seq.collect
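
For sequence files, sc.sequenceFile is a shorter alternative to hadoopFile; a minimal sketch against the same tupleData output:

val seqData = sc.sequenceFile[Int, String]("/home/orienit/spark/output/tupleData")
seqData.collect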



Hadoop New API:
=====================

val hadoop2 = sc.newAPIHadoopFile[Text, Text, org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat]("/home/orienit/spark/output/tabData").map{ case (x, y) => (x.toString, y.toString) }
hadoop2.collect

val hadoop22 = sc.newAPIHadoopFile[LongWritable, Text, org.apache.hadoop.mapreduce.lib.input.TextInputFormat]("/home/orienit/spark/output/tabData").map{ case (x, y) => (y.toString.split("\t")(0), y.toString.split("\t")(1)) }
hadoop22.collect

val hadoop2_seq = sc.newAPIHadoopFile[IntWritable, Text, org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat[IntWritable, Text]]("/home/orienit/spark/output/tupleData").map{ case (x, y) => (x.toString, y.toString) }


(or)


val hadoop2_seq = sc.newAPIHadoopFile("/home/orienit/spark/output/tupleData", classOf[org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat[IntWritable, Text]], classOf[IntWritable], classOf[Text]).map{ case (x, y) => (x.toString, y.toString) }

hadoop2_seq.collect






Frequent Calls in Spark:
==============================

rdd.saveAsHadoopFile(outputpath, classOf[Key], classOf[Value], classOf[OutputFormat[Key,Value]]);


rdd.saveAsNewAPIHadoopFile(outputpath, classOf[Key], classOf[Value], classOf[OutputFormat[Key,Value]]);



val rdd = sc.hadoopFile[Key, Value, InputFormat[Key,Value]](inputpath)

val rdd = sc.hadoopFile(inputpath, classOf[InputFormat[Key,Value]], classOf[Key], classOf[Value])


val rdd = sc.newAPIHadoopFile[Key, Value, InputFormat[Key,Value]](inputpath)

val rdd = sc.newAPIHadoopFile(inputpath, classOf[InputFormat[Key,Value]], classOf[Key], classOf[Value])



val rdd = sc.textFile(inputpath)
val rdd = sc.parallelize(collection)

rdd.saveAsTextFile(outputpath)
rdd.saveAsSequenceFile(outputpath)
rdd.saveAsObjectFile(outputpath)
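
Object files round-trip the same way; a minimal sketch using mapRdd from above (the objData path is only an example):

mapRdd.saveAsObjectFile("file:///home/orienit/spark/output/objData")
val objRdd = sc.objectFile[(Int, String)]("file:///home/orienit/spark/output/objData")
objRdd.collect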











===================================================================
Hadoop Spark Examples using Python
===================================================================


key = sc.parallelize([1,2,3,4])
value = sc.parallelize(["a","b","c","d"])
mapRdd = key.zip(value)

data = mapRdd.map(lambda x : "\t".join([str(x[0]), x[1]]))

tupleData = mapRdd.map(lambda x : (x[0], x[1]))


data.saveAsTextFile("/home/orienit/spark/output/data")

data.saveAsTextFile("file:///home/orienit/spark/output/data")


tupleData.saveAsSequenceFile("/home/orienit/spark/output/tupleData")

tupleData.saveAsSequenceFile("file:///home/orienit/spark/output/tupleData")





==============================================
Store data into Hadoop using Hadoop APIs
==============================================



# for text format only


data_1 = mapRdd.map(lambda x : (x[0], x[1]))


(or)


# for any other format

data_1 = mapRdd.map(lambda x : (int(x[0]), x[1]))


data_1.saveAsHadoopFile("file:///home/orienit/spark/output/data_1","org.apache.hadoop.mapred.TextOutputFormat",
"org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text");


data_1.saveAsHadoopFile("file:///home/orienit/spark/output/tupleData_1",
"org.apache.hadoop.mapred.SequenceFileOutputFormat",
"org.apache.hadoop.io.LongWritable",
"org.apache.hadoop.io.Text");



data_1.saveAsNewAPIHadoopFile("file:///home/orienit/spark/output/data_2","org.apache.hadoop.mapreduce.lib.output.TextOutputFormat",
"org.apache.hadoop.io.LongWritable",
"org.apache.hadoop.io.Text");


data_1.saveAsNewAPIHadoopFile("file:///home/orienit/spark/output/tupleData_2",
"org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat",
"org.apache.hadoop.io.LongWritable",
"org.apache.hadoop.io.Text");




==============================================
Load data from Hadoop using Hadoop APIs
==============================================



data = sc.textFile("/home/orienit/spark/output/data").map(lambda x : (x.split("\t")[0],x.split("\t")[1]))

data.collect()



Hadoop Old API:
=====================

hadoop1 = sc.hadoopFile("file:///home/orienit/spark/output/data",
"org.apache.hadoop.mapred.KeyValueTextInputFormat",
"org.apache.hadoop.io.Text",
"org.apache.hadoop.io.Text").map(lambda x: (str(x[0]), str(x[1])))
hadoop1.collect()



hadoop11 = sc.hadoopFile("file:///home/orienit/spark/output/data",
"org.apache.hadoop.mapred.TextInputFormat","org.apache.hadoop.io.LongWritable",
"org.apache.hadoop.io.Text").map(lambda x : (long(x[1].split("\t")[0]), str(x[1].split("\t")[1])))
hadoop11.collect()



hadoop1_seq = sc.hadoopFile("file:///home/orienit/spark/output/tupleData",
"org.apache.hadoop.mapred.SequenceFileInputFormat",
"org.apache.hadoop.io.LongWritable",
"org.apache.hadoop.io.Text").map(lambda x: (long(x[0]), str(x[1])))
hadoop1_seq.collect()





Hadoop New API:
=====================


hadoop2 = sc.newAPIHadoopFile("file:///home/orienit/spark/output/data",
"org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat","org.apache.hadoop.io.Text",
"org.apache.hadoop.io.Text").map(lambda x: (str(x[0]), str(x[1])))
hadoop2.collect()


hadoop22 = sc.newAPIHadoopFile("file:///home/orienit/spark/output/data",
"org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
"org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text").map(lambda x : (long(x[1].split("\t")[0]), str(x[1].split("\t")[1])))

hadoop22.collect()



hadoop2_seq = sc.newAPIHadoopFile("file:///home/orienit/spark/output/tupleData",
"org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
"org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text").map(lambda x: (long(x[0]), str(x[1])))

hadoop2_seq.collect()





