Monday, November 17, 2014

Using a Complex Structure as a Spark Accumulator

In two earlier blogs I discussed the uses of Accumulators in Spark. I will continue that discussion by describing how Accumulators may be used with more complex structures. The structure described here keeps statistics on a double variable, tracking the mean, standard deviation, minimum, and maximum values.
While the code was written in part to test the capabilities of accumulators, there were real problems motivating it. Accumulators are an excellent way to pull secondary, summary results from processing an RDD without interfering with the main result.
The Statistics class accumulates statistics on data. The object is immutable; addition returns a new object combining either one more number or one other Statistics. The code is shown below.
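The class itself was embedded from an external source and does not render here, so the listing below is a minimal sketch rather than the original. It assumes the class tracks a count, a sum, and a sum of squares (plus min and max), and that Statistics.ZERO, PARAM_INSTANCE, getNumber(), and getAverage() behave as the usage later in this post implies; the field names and the sample standard deviation formula are illustrative.

    import java.io.Serializable;
    import org.apache.spark.AccumulatorParam;

    public class Statistics implements Serializable {
        // Identity element: no values seen yet
        public static final Statistics ZERO =
                new Statistics(0, 0, 0, Double.MAX_VALUE, -Double.MAX_VALUE);

        // Tells Spark how to zero and merge Statistics objects (Spark 1.x API)
        public static final AccumulatorParam<Statistics> PARAM_INSTANCE =
                new AccumulatorParam<Statistics>() {
                    @Override public Statistics zero(final Statistics initial) { return ZERO; }
                    @Override public Statistics addInPlace(final Statistics a, final Statistics b) { return a.add(b); }
                    @Override public Statistics addAccumulator(final Statistics a, final Statistics b) { return a.add(b); }
                };

        private final long count;
        private final double sum;
        private final double sumSquares;
        private final double min;
        private final double max;

        public Statistics(final double value) {
            this(1, value, value * value, value, value);
        }

        private Statistics(final long count, final double sum, final double sumSquares,
                           final double min, final double max) {
            this.count = count;
            this.sum = sum;
            this.sumSquares = sumSquares;
            this.min = min;
            this.max = max;
        }

        // All addition returns a new immutable object
        public Statistics add(final double value) {
            return add(new Statistics(value));
        }

        public Statistics add(final Statistics other) {
            return new Statistics(
                    count + other.count,
                    sum + other.sum,
                    sumSquares + other.sumSquares,
                    Math.min(min, other.min),
                    Math.max(max, other.max));
        }

        public long getNumber() { return count; }
        public double getAverage() { return count == 0 ? 0 : sum / count; }
        public double getMin() { return min; }  // Double.MAX_VALUE if empty
        public double getMax() { return max; }

        // Sample standard deviation recovered from the sum and sum of squares
        public double getStandardDeviation() {
            if (count < 2) return 0;
            double variance = (sumSquares - sum * sum / count) / (count - 1);
            return Math.sqrt(Math.max(0, variance));
        }
    }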

The code may be used in two ways: to create an accumulator, or directly, as in the combineByKey sample shown later.

To create an accumulator, write:

    // Make an accumulator backed by Statistics
    final Accumulator<Statistics> totalLetters =
            ctx.accumulator(Statistics.ZERO, "Total Letters", Statistics.PARAM_INSTANCE);

    // lines from word count
    JavaRDD<String> lines = ctx.textFile(args[0], 4);

    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(final String s) throws Exception {
            // Handle accumulator here - record the length of every line
            totalLetters.add(new Statistics(s.length()));
            ... // other stuff
        }
    });
    // more code
    Statistics letterStatistics = totalLetters.value();
    long numberLetters = letterStatistics.getNumber();
    double averageLineLength = letterStatistics.getAverage();

When multiple keys are involved, the same structure may be used with combineByKey to generate separate statistics for each key, as in the sketch below.
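The embedded combineByKey sample is also missing from this page; the following is a minimal sketch, assuming an input JavaPairRDD<String, Double> named valuesByKey (a hypothetical name) and the Statistics sketch above:

    // valuesByKey: a JavaPairRDD<String, Double> of key/value pairs (assumed)
    JavaPairRDD<String, Statistics> statsByKey = valuesByKey.combineByKey(
            // createCombiner: start statistics from the first value seen for a key
            new Function<Double, Statistics>() {
                @Override
                public Statistics call(final Double x) {
                    return new Statistics(x);
                }
            },
            // mergeValue: fold another value into a partition's running statistics
            new Function2<Statistics, Double, Statistics>() {
                @Override
                public Statistics call(final Statistics stats, final Double x) {
                    return stats.add(x);
                }
            },
            // mergeCombiners: combine statistics built on different partitions
            new Function2<Statistics, Statistics, Statistics>() {
                @Override
                public Statistics call(final Statistics a, final Statistics b) {
                    return a.add(b);
                }
            });

Because Statistics.add is associative and commutative, the same merge logic serves both the accumulator's addInPlace and the mergeCombiners step of combineByKey.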

All code for this article is available here.
