*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink
*.sink.graphite.host=[hostname]
*.sink.graphite.port=[port]
*.sink.graphite.prefix=some_meaningful_name
driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource
executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource
# special function to parse Spark metrics in graphite:
# for driver: aliasSub(stats.analytics.$job_name.*.prod.$dc.*.driver.jvm.heap.used, ".*(application_[0-9]+).*", "heap: \1")
# for executors: aliasSub(groupByNode(stats.analytics.$job_name.*.prod.$dc.*.[0-9]*.jvm.heap.used, 6, "sumSeries"), "(.*)", "heap: \1")
spark-submit --master yarn --deploy-mode cluster # yarn cluster mode (driver in yarn)
--conf spark.yarn.maxAppAttempts=4 # default 2
--conf spark.yarn.am.attemptFailuresValidityInterval=1h # reset the count every hour (a streaming app can last months)
--conf spark.yarn.max.executor.failures={8 * num_executors} # default is max(2*num_executors, 3). So for 4 executors: 32 against 8.
--conf spark.yarn.executor.failuresValidityInterval=1h # same as before, but for the executors
--conf spark.task.maxFailures=8 # default is 4
--queue realtime_queue # do not mess with the default queue
--conf spark.speculation=true # make sure tasks are idempotent: speculation launches a duplicate attempt of a slow task, so both attempts may run to completion
--conf spark.driver.extraJavaOptions=-Dlog4j.configuration=file:log4j.properties # eh, logging to some logstash for instance
--conf spark.executor.extraJavaOptions=-Dlog4j.configuration=file:log4j.properties
--files /path/to/log4j.properties,/path/to/metrics.properties # --files takes a comma-separated list, not colon-separated