1.collect算子

*使用foreachACTION操作 ,collect在远程集群中遍历RDD的元素

*使用collect操作,将分布式在远程集群中的数据拉取到本地
*这种方式不建议使用,如果数据量大,会使用大量 的网络带宽
*这种方式不建议使用。

package kw.test.action;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction; public class Collect {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("FlatMap").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<String> list = Arrays.asList("wo am xues","ni shi la ji ","zha zs","zha df");
JavaRDD<String> javaRDD = javaSparkContext.parallelize(list);
JavaRDD<String> result = javaRDD.flatMap(new FlatMapFunction<String, String>() { @Override
public Iterator<String> call(String arg0) throws Exception {
// TODO Auto-generated method stub
//首选将数据分割,然后将和数据值机型压扁。
return Arrays.asList(arg0.split(" ")).iterator();
}
});
List<String> str = result.collect();
for(String s : str)
{
System.out.println(s);
} }
}

 2.count

* 这个是一个action,他是有返回值的。

package kw.test.action;

import java.util.Arrays;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; /*
* 这个是一个action,他是有返回值的。
*/
public class count {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("FlatMap").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<Integer> list = Arrays.asList(,,,,,,,,);
JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(list);
long num = javaRDD.count();
System.out.println(num);
} }

3.filter

*将返回true的数据进行输出

package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction; public class Filter {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("Filter").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<Integer> list = Arrays.asList(,,,,,);
JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(list);
JavaRDD<Integer> redult = javaRDD.filter(new Function<Integer, Boolean>() { @Override
public Boolean call(Integer arg0) throws Exception {
// TODO Auto-generated method stub
return arg0%==;//返回的为true的就可以了。
}
});
redult.foreach(new VoidFunction<Integer>() { @Override
public void call(Integer arg0) throws Exception {
// TODO Auto-generated method stub
System.out.println(arg0);
}
});
}
}

4.coalesce

package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
/*
* 此参数将返回一个RDD,分区的参数会别为设置的哪一个参数的个数
*
* 这个返回是一个窄依赖,如果将一个100的变为10个partition的时候,这个时候
* 不会进行shuffle运算,
*
* 如果将好多个partition变为一个的时候,这个时候需要使用shuffle,他会进行shuffle的传递
* 一般是较少使用,因为需要将数据进行传递。
*
*
*
* 不仅仅是将partition变得更少,同时也可以将partition变的更大,这个时候需要将shuffle变为true
* 如果将partition变的更多的时候,也是需要将将shuffle设置为true的。
*/ public class FilterOperter {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("Coalesce").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<Integer> list = Arrays.asList(1,2,3,4,5,6);
JavaRDD<Integer> numbers = javaSparkContext.parallelize(list,6);
//Colesce算子,在执行filter之后,有的partition上的数据就会变得很少,容易造成数据的倾斜
JavaRDD<String> result = numbers.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() { @Override
public Iterator<String> call(Integer arg0, Iterator<Integer> arg1)
throws Exception {
// TODO Auto-generated method stub
List<String> list = new ArrayList<String>();
while(arg1.hasNext())
{
list.add(arg0+" "+arg1.next());
}
return list.iterator();
}
}, true); result.foreach(new VoidFunction<String>() { @Override
public void call(String arg1) throws Exception {
// TODO Auto-generated method stub
System.out.println(arg1+" ");
} });
JavaRDD<String> javaCoaxlesce = result.coalesce(3);
JavaRDD<String> coalesce = javaCoaxlesce.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() { @Override
public Iterator<String> call(Integer arg0, Iterator<String> arg1)
throws Exception {
// TODO Auto-generated method stub
List<String> list = new ArrayList<String>();
while(arg1.hasNext())
{
list.add(arg1.next()+" "+arg0); }
return list.iterator();
}
}, true);//参数的含义:true的含义就是是否进行shuffle。默认是不进行shuffle
coalesce.foreach(new VoidFunction<String>() { @Override
public void call(String arg0) throws Exception {
// TODO Auto-generated method stub
System.out.println(" "+ arg0);
}
});
} }

 5.floatmap

package kw.test.action;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction; /*
* 先执行一个map操作,然后将数据flat压扁
*/
public class FlatMap {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("FlatMap").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<String> list = Arrays.asList("wo am xues","ni shi la ji ","zha zs","zha df");
JavaRDD<String> javaRDD = javaSparkContext.parallelize(list);
JavaRDD<String> result = javaRDD.flatMap(new FlatMapFunction<String, String>() { @Override
public Iterator<String> call(String arg0) throws Exception {
// TODO Auto-generated method stub
//首选将数据分割,然后将和数据值机型压扁。
return Arrays.asList(arg0.split(" ")).iterator();
}
});
result.foreach(new VoidFunction<String>() { @Override
public void call(String arg0) throws Exception {
// TODO Auto-generated method stub
System.out.println(arg0);
}
}); } }

6.mappartition

package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction; public class MapPartition {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("MapPartition").setMaster("local");
JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
List<String> list = Arrays.asList("wo","shi","lal","woowj","dffdds");
JavaRDD<String> javaRDD = sparkContext.parallelize(list); JavaRDD<String> result = javaRDD.mapPartitions(new FlatMapFunction<Iterator<String>, String>() { //将partition的数据全部给map,然后使用迭代器处理
@Override
public Iterator<String> call(Iterator<String> arg0)throws Exception {
// TODO Auto-generated method stub
List<String> list = new ArrayList<String>();
while(arg0.hasNext() )
{
list.add(arg0.next());
}
return list.iterator();
}
});
result.foreach(new VoidFunction<String>() { @Override
public void call(String arg0) throws Exception {
// TODO Auto-generated method stub
System.out.println(arg0);
}}); } }

  7.MapPartitionsWithIndex

package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction; /*
*k可以获取到里面的分区位置
*/
public class MapPartitionsWithIndex {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("MapPartitionsWithIndex").setMaster("local");
JavaSparkContext sparkContext =new JavaSparkContext(sparkConf); List<String> list = Arrays.asList("kang","wang","ddd","kang1","wang2","ddd3");
JavaRDD<String> javaRDD = sparkContext.parallelize(list,4); JavaRDD<String> javaRDD2 = javaRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() { @Override
public Iterator<String> call(Integer arg0, Iterator<String> arg1)
throws Exception {
// TODO Auto-generated method stub
List<String> list = new ArrayList<String>();
while(arg1.hasNext())
{
list.add(arg1.next()+" "+ arg0);
}
return list.iterator();
}
}, true);
javaRDD2.foreach(new VoidFunction<String>() { @Override
public void call(String arg0) throws Exception {
// TODO Auto-generated method stub
System.out.println(arg0);
}
});
}
}

  8.Reduce

package kw.test.action;

import java.util.Arrays;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2; public class Reduce {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("Reduce").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<Integer> list= Arrays.asList(1,2,3,4,5,6,7);
JavaRDD<Integer> num = javaSparkContext.parallelize(list);
int sum = num.reduce(new Function2<Integer, Integer, Integer>() { @Override
public Integer call(Integer arg0, Integer arg1) throws Exception {
// TODO Auto-generated method stub
return arg0+arg1;
}
});
System.out.println("最终结果:"+sum);
} }

9.ReduceByKey

package kw.test.action;

import java.util.Arrays;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction; import scala.Tuple2; /*
* ReduceByKey是一个shuffle操作
*
* 在shuffle的时候,分为map端和reduce端
*
* spark里面的reduceByKey在map端映带conbiner。
* 也就是在map中处理然后将其累加,减少了网络的传输,效率更高。
*/
public class ReduceByKey {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("Reduce").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<Tuple2<String, Integer>> list= Arrays.asList(new Tuple2<String,Integer>("kang",),
new Tuple2<String,Integer>("kang",),
new Tuple2<String,Integer>("kang",),
new Tuple2<String,Integer>("kang",),
new Tuple2<String,Integer>("wang",));
JavaPairRDD<String, Integer> num = javaSparkContext.parallelizePairs(list);
JavaPairRDD<String, Integer> rr = num.reduceByKey(new Function2<Integer, Integer, Integer>() { @Override
public Integer call(Integer arg0, Integer arg1) throws Exception {
// TODO Auto-generated method stub
return arg0+arg1;
}
});
rr.foreach(new VoidFunction<Tuple2<String,Integer>>() { @Override
public void call(Tuple2<String, Integer> arg0) throws Exception {
// TODO Auto-generated method stub
System.out.println(arg0);
}
});
} }

10.RePartition

package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction; /*
* repartition算子,用于任意数据量RDD的partition增多或者减少。
* coalesce仅仅将RDD的数量减少【其实不是这样的】
* 、
*
* 建议使用的场景:
* 一个很经典的场景,使用spark SQL从HIVE中查询数据的时候,spark SQLhui
* 会根据HIVE对应的hdfs文件的block的数量决定加载出来的RDD的个数是
* 多少个,这里默认的partition的数量是我们根本无法设置的
*
*
* 有的时候,可能他会自动的设置partition的数量过于少,就进行优化
* 可以提高并发度,就是对RDD使用的partition的算子。
*
*
* 一般情况下,我们为了减少shuffle的时候,我们首选使用coalesce
* 因为他可以避免shuffle操作。
*
*
*/
public class RePartition {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("Coalesce").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<Integer> list = Arrays.asList(1,2,3,4,5,6); JavaRDD<Integer> numbers = javaSparkContext.parallelize(list); JavaRDD<String> result = numbers.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() { @Override
public Iterator<String> call(Integer arg0, Iterator<Integer> arg1)
throws Exception {
// TODO Auto-generated method stub List<String> list = new ArrayList<String>();
while(arg1.hasNext())
{
list.add(arg1.next()+" "+arg0);
}
return list.iterator();
}
}, true);
result.foreach(new VoidFunction<String>() { @Override
public void call(String arg0) throws Exception {
// TODO Auto-generated method stub
System.out.println(arg0+" ");
}
}); result.repartition(5);
JavaRDD<String> result2 = result.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() { @Override
public Iterator<String> call(Integer arg0, Iterator<String> arg1)
throws Exception {
// TODO Auto-generated method stub
List<String> list = new ArrayList<String>();
while(arg1.hasNext())
{
list.add(arg1.next()+" "+arg0);
}
return list.iterator();
} }, true);
result2.foreach(new VoidFunction<String>() { @Override
public void call(String arg0) throws Exception {
// TODO Auto-generated method stub
System.out.println(arg0+" ");
}
});
} }

11.Sample

package kw.test.action;

import java.util.Arrays;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction; /*
* 随机采样从,可以传入一个Float,不如数0.3就是采样30%
* 比如说我在测试的时候使用的是5个单词,最后也就显示的是1个,当然了 Float的值是可以修改的,参数的个数也是可以自己给定的。
*
* 这个也可以设置partition的个数,如果是一个就先手wordnum*float,如果是两个的时候,他们的值是每一个里面取出wordNum*float的值
*/
public class Sample {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("sample").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<String> list = Arrays.asList("kwang","wang","is","a","student");
JavaRDD<String> num = javaSparkContext.parallelize(list);
//参数:false就是抽取之后,不会将数据放回,最终的结果是不会有重复的,如果是true会将数据放回再次抽取,最终结果就会有重复的。
//它的第三个参数,种子,如果不指定就会自动产生一个种子,所以每次的结果都是不一样的,但是如果指定一个种子
//那么它的最终的结果都是一样的。此参数一般作为测试使用。
num.sample(false, 0.8).foreach(new VoidFunction<String>() { @Override
public void call(String arg0) throws Exception {
// TODO Auto-generated method stub
System.out.println(arg0);
}
});
} }

12.Take

package kw.test.action;

import java.util.Arrays;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; /*
* take是取数据,参数是几,我们就取其中的几个参数出来
*/
public class Take {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("take").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<String> list = Arrays.asList("kang","wang","lala");
JavaRDD<String> result = javaSparkContext.parallelize(list);
/*
* List<String> list1 = Arrays.asList("kang","wang","lala");
List<Integer> list = Arrays.asList(1,2,3,4,5);
JavaRDD<Integer> result = javaSparkContext.parallelize(list);
JavaRDD<String> result1 = javaSparkContext.parallelize(list1);
result.take(2);
result1.take(num); */
List<String> name = result.take();
for(String value :name)
{
System.out.println(value);
}
} }

13.TakeSample

package kw.test.action;

import java.util.Arrays;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; /*
* take是取出数据,sample是随机的取出数据
*
* takeSample先进行sample在进行采样
*/
public class TakeSample {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("takesample").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<String> list = Arrays.asList("kang1","wang1","lala1","kang2","wang2","lala2","kang3","wang3","lala3");
JavaRDD<String> result = javaSparkContext.parallelize(list);
//第一个参数是否放回,第二个参数是数据的个数,第三个参数是种子。 如果种子一样,每一次的数据都是一样的,如果种子不一样,每一次的参数都不一样。
List<String> value = result.takeSample(false, 3);
for(String v:value)
{
System.out.println(v);
}
}
}

14.Union

package kw.test.action;

import java.util.Arrays;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction; /*
* 将两个list变为一个list,他不会进行shuffle操作,加入开始都是两个partition,
* 将他们union之后,会是四个
*/
public class Union {
public static void main(String[] args) { // TODO Auto-generated method stub
SparkConf sparkConf = new SparkConf().setAppName("union").setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
List<String> list1 = Arrays.asList("kang1","wang1","lala1","kang2","wang2","lala2");
List<String> list2 = Arrays.asList("kang3","wang3","lala3");
JavaRDD<String> result1 = javaSparkContext.parallelize(list1,2);
JavaRDD<String> result2 = javaSparkContext.parallelize(list2,2);
JavaRDD <String> unionre = result1.union(result2);
unionre.foreach(new VoidFunction<String>() { @Override
public void call(String arg0) throws Exception {
// TODO Auto-generated method stub
System.out.println(arg0);
}
});
}
}

  

05-11 18:31