# Start a cluster that needs a VPC.
# Some instance types (e.g. R4) can only be launched inside a VPC, hence the
# SubnetId in --ec2-attributes — replace subnet-xxxxxxx with a real subnet id.
# NB: --applications takes multiple space-separated Name= words (CLI shorthand).
aws emr create-cluster --profile "$KEY" \
  --name "Test Spark Cluster with VPC" \
  --release-label emr-5.10.0 \
  --applications Name=Hadoop Name=Spark \
  --ec2-attributes "KeyName=${KEY},SubnetId=subnet-xxxxxxx" \
  --instance-type r4.4xlarge \
  --instance-count 3 \
  --use-default-roles
# Important
- Never put spaces inside Args=[arg1,arg2,arg3]: the AWS CLI shorthand syntax must reach the command as a single word, and an unquoted space would split it.
# Shell variables used by the commands in this file.
# NB: assign with NAME=value — writing '$NAME=value' is a bug: the shell
# expands $NAME first and tries to run the result as a command.
JAR=/usr/lib/spark/lib/spark-examples.jar  # example jar shipped with Spark on EMR
KEY=MoissinB                               # EC2 key-pair name (also reused as the CLI profile name)
# Create a cluster and submit a first Spark step.
# --auto-terminate: the cluster shuts down after the last step finishes.
# Pitfalls fixed here:
#   * a comment must never follow a trailing backslash — it breaks the
#     line continuation and splits the command in two;
#   * the whole --steps shorthand must be ONE shell word: no spaces around
#     the joined fragments and no spaces inside Args=[...];
#   * --applications takes space-separated Name= words, not a comma list.
aws emr create-cluster --profile "$KEY" \
  --name "My Cluster" \
  --release-label emr-5.10.0 \
  --applications Name=Hadoop Name=Spark \
  --ec2-attributes "KeyName=${KEY}" \
  --instance-type m3.xlarge \
  --instance-count 3 \
  --auto-terminate \
  --steps Type=Spark,Name="Spark Program - Task 1",ActionOnFailure=CONTINUE,Args=[--class,main.scala.task.Task1,"$JAR"] \
  --use-default-roles
# Add an execution step to a running cluster (runs another class from the jar).
# Replace j-2AXXXXXXGAPLF with your cluster id (printed by create-cluster).
# The --steps value is kept as a single shell word; "$JAR" is quoted so the
# word survives even if the path ever contains spaces.
aws emr add-steps --cluster-id j-2AXXXXXXGAPLF \
  --steps Type=Spark,Name="Spark Program",ActionOnFailure=CONTINUE,Args=[--class,org.apache.spark.examples.SparkPi,"$JAR",10]
# EMR
- https://aws.amazon.com/blogs/aws/new-apache-spark-on-amazon-emr/
- https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-launch.html
# Instances
- https://aws.amazon.com/emr/pricing/
- https://aws.amazon.com/ec2/instance-types/
## Starting a cluster with VPC on EMR
Some instance types with larger computational power (e.g. R4) must be launched in a VPC.
- http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-vpc
- http://docs.aws.amazon.com/emr/latest/DeveloperGuide/emr-clusters-in-a-vpc.html
- http://docs.aws.amazon.com/emr/latest/DeveloperGuide/emr-vpc-host-job-flows.html
- http://docs.aws.amazon.com/emr/latest/DeveloperGuide/emr-vpc-launching-job-flows.html