Spark Accumulator Usage Examples

Posted by zz-ksw



Official documentation

http://spark.apache.org/docs/2.3.1/rdd-programming-guide.html#accumulators

http://spark.apache.org/docs/2.3.1/api/scala/index.html#org.apache.spark.util.AccumulatorV2

Accumulator is the accumulator facility provided by Spark. A common use is counting events that occur during job execution, for example while debugging. Only the driver can read an accumulator's value (by calling its value method); tasks can only add to it. An accumulator can also be given a name (not supported in Python) so that it appears in the Spark web UI, which helps in understanding how the program is running.

Numeric accumulators are created with SparkContext.longAccumulator() or SparkContext.doubleAccumulator(), which accumulate Long and Double values respectively. Tasks running on the cluster can then add to them with the add method, but they cannot read the accumulator's value; only the driver program can read it, using the value method.
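To make this concrete, here is a minimal sketch (the variable counter and the name "myCounter" are just placeholders; jsc is a JavaSparkContext as in the full examples below):

LongAccumulator counter = jsc.sc().longAccumulator("myCounter"); // named, so it shows up in the web UI
jsc.parallelize(Arrays.asList(1, 2, 3)).foreach(x -> counter.add(x)); // tasks can only add to it
System.out.println(counter.value()); // only the driver reads the value; prints 6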

 

Built-in Spark accumulator usage example

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;

import java.util.Arrays;

/**
 * Spark ships with built-in numeric accumulators such as LongAccumulator and DoubleAccumulator.
 */
public class TestAccumulator {


    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        SparkSession spark = SparkSession
                .builder()
                .appName("gxl")
                .master("local")
                .config(conf)
                .enableHiveSupport()
                .getOrCreate();

        JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        LongAccumulator accum = jsc.sc().longAccumulator();
        JavaRDD<Integer> javaRDD = jsc.parallelize(Arrays.asList(1, 2, 3));

        // Uncomment one of the examples below to run it:
//        onlyMapOperator(accum,javaRDD);
//        accumInActionOperator(accum,javaRDD);
//        accumExecuteRepeatedly(accum,javaRDD);
    }


    private static void accumInActionOperator(LongAccumulator accum,JavaRDD<Integer> javaRDD){
        // foreach is an action, so the task-side updates are sent back to the driver
        javaRDD.foreach(x -> accum.add(x));
        System.out.println(accum.value());
    }

    private static void onlyMapOperator(LongAccumulator accum,JavaRDD<Integer> javaRDD){
        // map is a lazy transformation: with no action, the map never runs and the accumulator stays at 0
        javaRDD.map((Function<Integer, Integer>) v1 -> {
            accum.add(v1);
            return v1;
        });
        System.out.println(accum.value());
    }

    private static void accumExecuteRepeatedly(LongAccumulator accum,JavaRDD<Integer> javaRDD){
        JavaRDD<Integer> map = javaRDD.map((Function<Integer, Integer>) v1 -> {
            accum.add(v1);
            return v1;
        });
        // Without caching, each action re-executes the map, so the accumulator is
        // incremented once per action (6 after count(), 12 after the reduce()):
//        map.count();
//        System.out.println(accum.value());
//        map.reduce((Function2<Integer, Integer, Integer>) (v1, v2) -> v1+v2);
//        System.out.println(accum.value());

        // Cache the mapped RDD so later actions reuse it instead of re-running the map;
        // the accumulator then stays at 6 after the two actions below
        JavaRDD<Integer> cache = map.cache();
        cache.count();
        System.out.println(accum.value());
        cache.reduce((Function2<Integer, Integer, Integer>) (v1, v2) -> v1+v2);
        System.out.println(accum.value());
    }

}
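A note on what these three helpers demonstrate (assuming they run as written): accumInActionOperator prints 6, because foreach is an action and the task updates flow back to the driver; onlyMapOperator prints 0, because map is a lazy transformation that is never triggered; and accumExecuteRepeatedly shows that accumulator updates made inside transformations can be applied more than once when a stage is re-executed, which is why the mapped RDD is cached before two actions are run over it.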

 

Custom Spark accumulator usage example

import org.apache.spark.util.AccumulatorV2;

/**
 * Spark 2.3
 * A custom accumulator extends AccumulatorV2 and overrides the methods below.
 * This one concatenates matching records into a single comma-separated string.
 */
public class MyAccumulator extends AccumulatorV2<String,String> {

    private StringBuffer stringBuffer = new StringBuffer();
    /**
     * Returns whether this accumulator holds its zero value (i.e. the buffer is empty).
     * @return
     */
    @Override
    public boolean isZero() {
        return stringBuffer.length() == 0;
    }

    /**
     * Creates a new copy of this accumulator.
     * @return
     */
    @Override
    public AccumulatorV2<String,String> copy() {
        MyAccumulator newMyAccu = new MyAccumulator();
        newMyAccu.stringBuffer.append(stringBuffer);
        return newMyAccu;
    }

    /**
     * Resets this accumulator to its zero value (an empty buffer).
     */
    @Override
    public void reset() {
        stringBuffer.setLength(0);
    }

    /**
     * Takes the inputs and accumulates.
     * @param input
     */
    @Override
    public void add(String input) {
        stringBuffer.append(input).append(",");
    }

    /**
     * Merges another accumulator of the same type into this one, updating its state.
     * @param other
     */
    @Override
    public void merge(AccumulatorV2<String, String> other) {
        stringBuffer.append(other.value());
    }

    /**
     * Returns the current value of this accumulator.
     * @return
     */
    @Override
    public String value() {
        return stringBuffer.toString();
    }
}
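Before a custom AccumulatorV2 can be updated from tasks, it needs to be registered with the SparkContext, as the test below does with jsc.sc().register(myAccumulator, "myAccumulator"); the second argument is the name under which it appears in the Spark web UI.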

 

Test example

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.SparkSession;

import java.util.Arrays;
import java.util.HashSet;

/**
 * Test driver for the custom MyAccumulator defined above.
 */
public class TestAccumulator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        SparkSession spark = SparkSession
                .builder()
                .appName("gxl")
                .master("local")
                .config(conf)
                .enableHiveSupport()
                .getOrCreate();

        JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
        testMyAccumulator(jsc);
    }

    private static void testMyAccumulator(JavaSparkContext jsc){
        MyAccumulator myAccumulator = new MyAccumulator();
        // register the custom accumulator with the SparkContext; the name appears in the web UI
        jsc.sc().register(myAccumulator,"myAccumulator");

        HashSet<String> blacklist = new HashSet<>();
        blacklist.add("jack");

        JavaRDD<String> stringJavaRDD = jsc.parallelize(Arrays.asList("jack", "kevin", "wade", "james"));
        JavaRDD<String> filter = stringJavaRDD.filter((Function<String, Boolean>) v1 -> {
            if (blacklist.contains(v1)) {
                // blacklisted names pass through the filter unchanged
                return true;
            } else {
                // every other name is appended to the accumulator and dropped
                myAccumulator.add(v1);
                return false;
            }
        });
        filter.count();
        System.out.println(myAccumulator.value());
    }

}
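Run as written, only "jack" survives the filter, while the other three names are appended to the accumulator, so the final println should produce something like kevin,wade,james, (the trailing comma comes from add; with the single-threaded local master the order matches the input order).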

 
