===================================================================
Hadoop Spark Examples using Scala
===================================================================
val key = sc.parallelize(List(1,2,3,4))
val value = sc.parallelize(List("a","b","c","d"))
val mapRdd = key.zip(value)
val tabData = mapRdd.map(x => x._1 + "\t" + x._2)
val tupleData = mapRdd.map(x => (x._1, x._2))
tabData.saveAsTextFile("/home/orienit/spark/output/tabData")
tabData.saveAsTextFile("file:///home/orienit/spark/output/tabData")
tupleData.saveAsSequenceFile("/home/orienit/spark/output/tupleData")
tupleData.saveAsSequenceFile("file:///home/orienit/spark/output/tupleData")
import org.apache.hadoop.conf._
import org.apache.hadoop.io._
import org.apache.hadoop.mapred._
import org.apache.hadoop.mapreduce.lib.input._
import org.apache.hadoop.mapreduce.lib.output._
==============================================
Store data into Hadoop using the Hadoop APIs
==============================================
// for text format only
val data_1 = mapRdd.map(x => (x._1, x._2))
(or)
// any other formats //
val data_1 = mapRdd.map(x => (new LongWritable(x._1), new Text(x._2)))
val conf = new Configuration()
data_1.saveAsHadoopFile("file:///home/orienit/spark/output/data_1", classOf[LongWritable], classOf[Text], classOf[org.apache.hadoop.mapred.TextOutputFormat[LongWritable, Text]]);
data_1.saveAsHadoopFile("file:///home/orienit/spark/output/tupleData_1", classOf[LongWritable], classOf[Text], classOf[org.apache.hadoop.mapred.SequenceFileOutputFormat[LongWritable, Text]]);
data_1.saveAsNewAPIHadoopFile("file:///home/orienit/spark/output/data_2", classOf[LongWritable], classOf[Text], classOf[org.apache.hadoop.mapreduce.lib.output.TextOutputFormat[LongWritable, Text]], conf);
data_1.saveAsNewAPIHadoopFile("file:///home/orienit/spark/output/tupleData_2", classOf[LongWritable], classOf[Text], classOf[org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat[LongWritable, Text]], conf);
==============================================
Load data from Hadoop using the Hadoop APIs
==============================================
val data = sc.textFile("/home/orienit/spark/output/tabData").map(x => (x.split("\t")(0), x.split("\t")(1)))
data.collect
Hadoop Old API:
=====================
val hadoop1 = sc.hadoopFile[Text, Text, org.apache.hadoop.mapred.KeyValueTextInputFormat]("/home/orienit/spark/output/tabData").map{ case (x, y) => (x.toString, y.toString) }
hadoop1.collect
val hadoop11 = sc.hadoopFile[LongWritable, Text, org.apache.hadoop.mapred.TextInputFormat]("/home/orienit/spark/output/tabData").map{ case (x, y) => (y.toString.split("\t")(0), y.toString.split("\t")(1)) }
hadoop11.collect
val hadoop1_seq = sc.hadoopFile[IntWritable, Text, org.apache.hadoop.mapred.SequenceFileInputFormat[IntWritable, Text]]("/home/orienit/spark/output/tupleData").map{ case (x, y) => (x.toString, y.toString) }
(or)
val hadoop1_seq = sc.hadoopFile("/home/orienit/spark/output/tupleData",classOf[org.apache.hadoop.mapred.SequenceFileInputFormat[IntWritable,Text]],classOf[IntWritable], classOf[Text]).map{ case (x, y) => (x.toString, y.toString) }
hadoop1_seq.collect
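For sequence files, sc.sequenceFile is a shorter alternative to the hadoopFile calls above; a minimal sketch, assuming the tupleData output saved earlier:
val seqData = sc.sequenceFile[Int, String]("/home/orienit/spark/output/tupleData")
seqData.collect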
Hadoop New API:
=====================
val hadoop2 = sc.newAPIHadoopFile[Text, Text, org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat]("/home/orienit/spark/output/tabData").map{ case (x, y) => (x.toString, y.toString) }
hadoop2.collect
val hadoop22 = sc.newAPIHadoopFile[LongWritable, Text, org.apache.hadoop.mapreduce.lib.input.TextInputFormat]("/home/orienit/spark/output/tabData").map{ case (x, y) => (y.toString.split("\t")(0), y.toString.split("\t")(1)) }
hadoop22.collect
val hadoop2_seq = sc.newAPIHadoopFile[IntWritable, Text, org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat[IntWritable, Text]]("/home/orienit/spark/output/tupleData").map{ case (x, y) => (x.toString, y.toString) }
(or)
val hadoop2_seq = sc.newAPIHadoopFile("/home/orienit/spark/output/tupleData", classOf[org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat[IntWritable, Text]], classOf[IntWritable], classOf[Text]).map{ case (x, y) => (x.toString, y.toString) }
hadoop2_seq.collect
Frequent Calls in Spark:
==============================
rdd.saveAsHadoopFile(outputpath, classOf[Key], classOf[Value], classOf[OutputFormat[Key,Value]]);
rdd.saveAsNewAPIHadoopFile(outputpath, classOf[Key], classOf[Value], classOf[OutputFormat[Key,Value]]);
val rdd = sc.hadoopFile[Key, Value, InputFormat[Key,Value]](inputpath)
val rdd = sc.hadoopFile(inputpath, classOf[InputFormat[Key,Value]], classOf[Key], classOf[Value])
val rdd = sc.newAPIHadoopFile[Key, Value, InputFormat[Key,Value]](inputpath)
val rdd = sc.newAPIHadoopFile(inputpath, classOf[InputFormat[Key,Value]], classOf[Key], classOf[Value])
val rdd = sc.textFile(inputpath)
val rdd = sc.parallelize(Object)
rdd.saveAsTextFile(outputpath)
rdd.saveAsSequenceFile(outputpath)
rdd.saveAsObjectFile(outputpath)
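The matching load calls for the last two saves, in the same placeholder style (ElementType stands for whatever element type was written by saveAsObjectFile):
val rdd = sc.sequenceFile[Key, Value](inputpath)
val rdd = sc.objectFile[ElementType](inputpath)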
===================================================================
Hadoop Spark Examples using Python
===================================================================
key = sc.parallelize([1,2,3,4])
value = sc.parallelize(["a","b","c","d"])
mapRdd = key.zip(value)
data = mapRdd.map(lambda x : "\t".join([str(x[0]), x[1]]))
tupleData = mapRdd.map(lambda x : (x[0], x[1]))
data.saveAsTextFile("/home/orienit/spark/output/data")
data.saveAsTextFile("file:///home/orienit/spark/output/data")
tupleData.saveAsSequenceFile("/home/orienit/spark/output/tupleData")
tupleData.saveAsSequenceFile("file:///home/orienit/spark/output/tupleData")
==============================================
Store data into Hadoop using the Hadoop APIs
==============================================
# for text format only
data_1 = mapRdd.map(lambda x : (x[0], x[1]))
(or)
# any other formats
data_1 = mapRdd.map(lambda x : (long(x[0]), x[1]))
data_1.saveAsHadoopFile("file:///home/orienit/spark/output/data_1","org.apache.hadoop.mapred.TextOutputFormat",
"org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text");
data_1.saveAsHadoopFile("file:///home/orienit/spark/output/tupleData_1",
"org.apache.hadoop.mapred.SequenceFileOutputFormat",
"org.apache.hadoop.io.LongWritable",
"org.apache.hadoop.io.Text");
data_1.saveAsNewAPIHadoopFile("file:///home/orienit/spark/output/data_2","org.apache.hadoop.mapreduce.lib.output.TextOutputFormat",
"org.apache.hadoop.io.LongWritable",
"org.apache.hadoop.io.Text");
data_1.saveAsNewAPIHadoopFile("file:///home/orienit/spark/output/tupleData_2",
"org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat",
"org.apache.hadoop.io.LongWritable",
"org.apache.hadoop.io.Text");
==============================================
Load data from Hadoop using the Hadoop APIs
==============================================
data = sc.textFile("/home/orienit/spark/output/data").map(lambda x : (x.split("\t")[0],x.split("\t")[1]))
data.collect()
Hadoop Old API:
=====================
hadoop1 = sc.hadoopFile("file:///home/orienit/spark/output/data",
"org.apache.hadoop.mapred.KeyValueTextInputFormat",
"org.apache.hadoop.io.Text",
"org.apache.hadoop.io.Text").map(lambda x: (str(x[0]), str(x[1])))
hadoop1.collect()
hadoop11 = sc.hadoopFile("file:///home/orienit/spark/output/data",
"org.apache.hadoop.mapred.TextInputFormat","org.apache.hadoop.io.LongWritable",
"org.apache.hadoop.io.Text").map(lambda x : (long(x[1].split("\t")[0]), str(x[1].split("\t")[1])))
hadoop11.collect()
hadoop1_seq = sc.hadoopFile("file:///home/orienit/spark/output/tupleData",
"org.apache.hadoop.mapred.SequenceFileInputFormat",
"org.apache.hadoop.io.LongWritable",
"org.apache.hadoop.io.Text").map(lambda x: (long(x[0]), str(x[1])))
hadoop1_seq.collect()
Hadoop New API:
=====================
hadoop2 = sc.newAPIHadoopFile("file:///home/orienit/spark/output/data",
"org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat","org.apache.hadoop.io.Text",
"org.apache.hadoop.io.Text").map(lambda x: (str(x[0]), str(x[1])))
hadoop2.collect()
hadoop22 = sc.newAPIHadoopFile("file:///home/orienit/spark/output/data",
"org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
"org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text").map(lambda x : (long(x[1].split("\t")[0]), str(x[1].split("\t")[1])))
hadoop22.collect()
hadoop2_seq = sc.newAPIHadoopFile("file:///home/orienit/spark/output/tupleData",
"org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
"org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text").map(lambda x: (long(x[0]), str(x[1])))
hadoop2_seq.collect()
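All of the loaders above should yield the same four pairs; a quick check (expected values assume the tupleData output written earlier in this section):
sorted(hadoop2_seq.collect())   # expected: [(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]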