
Commit bfc680d

Tom committed: Initial commit
1 parent 8876ede commit bfc680d

File tree

- README.md
- auto-scaling.tf
- files/userdata.template
- iam.tf
- main.tf
- security-groups.tf

6 files changed: +387 −0 lines changed

README.md

Lines changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
# Rancher host cluster Terraform module

This is a Terraform module to help with creating a Rancher host cluster. It is intended for use in combination with [my Rancher server module](https://github.com/greensheep/terraform-aws-rancher-server).

### Features

- Flexible for use with different deployment scenarios.
- Automatically adds hosts launched by autoscaling to the Rancher server.
- Registers an autoscaling lifecycle hook that automatically removes instances from the Rancher server on scale-down (see [my Rancher server module](https://github.com/greensheep/terraform-aws-rancher-server)).
- Designed for use in VPC private subnets, so it can host private backend services or serve public traffic proxied in from an ELB.
- Can be used any number of times in a Terraform config, allowing separate clusters for dev, staging, production, etc. (see the sketch after this list).
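For example, a minimal sketch of two independent clusters built from the same module (the module names are arbitrary and the elided arguments are the full set of variables shown under Usage below):

```
module "dev_cluster" {
    source = "github.com/greensheep/terraform-aws-rancher-hosts"
    cluster_name = "dev"
    # ... dev autoscaling group, security group, environment id and keys ...
}

module "production_cluster" {
    source = "github.com/greensheep/terraform-aws-rancher-hosts"
    cluster_name = "production"
    # ... production autoscaling group, security group, environment id and keys ...
}
```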
### Requirements

Terraform 0.6.6 is required.

On its own this module doesn't do very much. It needs to be included in a Terraform config that creates the following resources:

- Security group
- Autoscaling launch configuration
- Autoscaling group

Because these resources may vary significantly between deployments (e.g. the type of app you're deploying, expected workload, etc.), you need to create them yourself and pass in the necessary variables.

You'll also need to have your Rancher server set up and configured (did I mention [my Rancher server module](https://github.com/greensheep/terraform-aws-rancher-server)?). Don't be tempted to use this as part of some mega-config that also creates the server: you need to specify an existing environment id and API access keys for it to work! One way of passing those values in is sketched after the usage example below.
### Usage

Include the following in your existing Terraform config:

```
module "staging_cluster" {

    # Import the module from GitHub.
    # It's probably better to fork or clone this repo if you intend to use it in production,
    # so any future changes don't mess up your existing infrastructure.
    source = "github.com/greensheep/terraform-aws-rancher-hosts"

    # Add Rancher server details
    server_security_group_id = "sg-XXXXXXXX"
    server_hostname = "rancher-server.yourdomain.tld"

    # Rancher environment
    # In your Rancher server, create an environment and an API keypair. You can have
    # multiple host clusters per environment if necessary. Instances will be labelled
    # with the cluster name so you can differentiate between multiple clusters.
    environment_id = "1a7"
    environment_access_key = "ACCESS-KEY"
    environment_secret_key = "SECRET-KEY"

    # Name your cluster and provide the autoscaling group name and security group id.
    # See examples below.
    cluster_name = "${var.cluster_name}"
    cluster_autoscaling_group_name = "${aws_autoscaling_group.cluster_autoscale_group.id}"
    cluster_instance_security_group_id = "${aws_security_group.rancher_host_sg.id}"

    # Lifecycle hooks queue ARN
    # This is specific to my Rancher server module, which creates the SQS queue used to
    # receive autoscaling lifecycle hooks. This module creates a lifecycle hook for the
    # provided autoscaling group so that instances can be removed from the Rancher
    # server before they are terminated.
    lifecycle_hooks_sqs_queue_arn = "${var.lifecycle_hooks_sqs_queue_arn}"

}
```
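Since the server lives in its own config (per the advice above), the server-related values typically arrive in this config as input variables. A hypothetical terraform.tfvars for the host cluster config, with placeholder values copied from the server config's outputs (this assumes you declare matching variables in your root config):

```
# terraform.tfvars (placeholder values; copy the real ones from your
# Rancher server config's outputs)
cluster_name                  = "staging"
server_security_group_id     = "sg-XXXXXXXX"
server_hostname               = "rancher-server.yourdomain.tld"
lifecycle_hooks_sqs_queue_arn = "arn:aws:sqs:REGION:ACCOUNT-ID:QUEUE-NAME"
```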
### Examples of required resources

##### Security group

```
# Cluster instance security group
resource "aws_security_group" "cluster_instance_sg" {

    name = "Cluster-Instances"
    description = "Rules for connected Rancher host machines. These are the hosts that run containers placed on the cluster."
    vpc_id = "${TARGET-VPC-ID}"

    # NOTE: To allow ELB-proxied traffic to reach private VPC
    # hosts, open the necessary ports here (see the example rule below).

    lifecycle {
        create_before_destroy = true
    }

}
```
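For instance, if an ELB in the same VPC should reach containers listening on port 8080, an extra ingress rule along these lines would do it (the port and the `cluster_elb_sg` security group are assumptions for illustration):

```
# Example only: allow an ELB to reach the cluster hosts on an instance port
resource "aws_security_group_rule" "elb_ingress" {

    security_group_id = "${aws_security_group.cluster_instance_sg.id}"
    type = "ingress"
    from_port = 8080
    to_port = 8080
    protocol = "tcp"
    source_security_group_id = "${aws_security_group.cluster_elb_sg.id}"

}
```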
##### Autoscaling

```
# Autoscaling launch configuration
resource "aws_launch_configuration" "cluster_launch_conf" {

    name = "Launch-Config"

    # Amazon Linux, eu-west-1
    image_id = "ami-69b9941e"

    # No public ip when instances are placed in private subnets. See notes
    # about creating an ELB to proxy public traffic into the cluster.
    associate_public_ip_address = false

    # Security groups
    security_groups = [
        "${aws_security_group.cluster_instance_sg.id}"
    ]

    # Key
    # NOTE: It's a good idea to use the same key as the Rancher server here.
    key_name = "${UPLOADED-KEY-NAME}"

    # Add rendered userdata template
    user_data = "${module.staging_cluster.host_user_data}"

    # Misc
    instance_type = "t2.micro"
    enable_monitoring = true

    lifecycle {
        create_before_destroy = true
    }

}

# Autoscaling group
resource "aws_autoscaling_group" "cluster_autoscale_group" {

    name = "Cluster-ASG"
    launch_configuration = "${aws_launch_configuration.cluster_launch_conf.name}"
    min_size = "2"
    max_size = "2"
    desired_capacity = "2"
    health_check_grace_period = 180
    health_check_type = "EC2"
    force_delete = false
    termination_policies = ["OldestInstance"]

    # Add ELBs here if you're proxying public traffic into the cluster
    # (see the sketch below)
    # load_balancers = ["${var.instance_cluster_load_balancers}"]

    # Target subnets
    vpc_zone_identifier = ["${LIST-OF-VPC-PRIVATE-SUBNET-IDS}"]

    tag {
        key = "Name"
        value = "Test-Cluster-Instance"
        propagate_at_launch = true
    }

    lifecycle {
        create_before_destroy = true
    }

}
```
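If you do proxy public traffic into the cluster, the ELB itself might look roughly like the following sketch; the subnets, security group, ports and health check are placeholders to adapt to your VPC layout:

```
# Example only: classic ELB in public subnets forwarding HTTP traffic
# to the cluster instances (all values are placeholders)
resource "aws_elb" "cluster_elb" {

    name = "cluster-elb"
    subnets = ["${LIST-OF-VPC-PUBLIC-SUBNET-IDS}"]
    security_groups = ["${aws_security_group.cluster_elb_sg.id}"]

    listener {
        instance_port = 8080
        instance_protocol = "http"
        lb_port = 80
        lb_protocol = "http"
    }

    health_check {
        healthy_threshold = 2
        unhealthy_threshold = 2
        timeout = 3
        target = "TCP:8080"
        interval = 30
    }

}
```

Remember to also open the instance port in the cluster security group (see the ingress rule example above) and to attach the ELB to the autoscaling group via the commented-out `load_balancers` argument.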

auto-scaling.tf

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
# User-data template
# Registers the instance with the rancher server environment
resource "template_file" "user_data" {

    filename = "${path.module}/files/userdata.template"
    vars {
        cluster_name = "${var.cluster_name}"
        environment_id = "${var.environment_id}"
        environment_access_key = "${var.environment_access_key}"
        environment_secret_key = "${var.environment_secret_key}"
        server_hostname = "${var.server_hostname}"
    }

    lifecycle {
        create_before_destroy = true
    }

}

# Lifecycle hook
# Triggered when an instance should be removed from the autoscaling
# group. Publishes a message to the supplied SQS queue so that the host
# can be removed from the Rancher server before shutting down.
resource "aws_autoscaling_lifecycle_hook" "cluster_instance_terminating_hook" {

    name = "cluster_instance_terminating_hook"
    autoscaling_group_name = "${var.cluster_autoscaling_group_name}"
    lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING"
    default_result = "CONTINUE"

    # 10 mins for rancher server to remove instance
    heartbeat_timeout = 600

    # Notification SQS queue
    notification_target_arn = "${var.lifecycle_hooks_sqs_queue_arn}"

    role_arn = "${aws_iam_role.lifecycle_role.arn}"

    lifecycle {
        create_before_destroy = true
    }

}

# Rendered user data for host instances; pass this into your launch
# configuration's user_data (see the README usage example).
output "host_user_data" {
    value = "${template_file.user_data.rendered}"
}

files/userdata.template

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
#!/bin/bash

# Install jq
yum install -y jq

# Install docker
wget -qO- https://get.docker.com/ | sh
service docker start

# Setup initial vars
serverUrl=https://${environment_access_key}:${environment_secret_key}@${server_hostname}
projectId=${environment_id}

# Make initial POST request for a registration token and record the id
response=$(curl -s -X POST $serverUrl/v1/registrationtokens?projectId=$projectId)
requestId=$(echo $response | jq -r '.id')
requestState=$(echo $response | jq -r '.state')

# The registration token request is async so keep checking until it's complete
while [[ "$requestState" != "active" ]]; do
    sleep 2
    response=$(curl -s $serverUrl/v1/registrationtokens/$requestId)
    requestState=$(echo $response | jq -r '.state')
done

# Get the instance id from metadata
instanceId=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)

# Use the command in the response to start the rancher agent, adding
# CATTLE_HOST_LABELS so the host is labelled with its instance id, cloud
# and cluster name. The "$$" stops Terraform interpolating the bash
# substitution; it renders as a single "$" in the final script.
cmd=$(echo $response | jq -r '.command')
eval $${cmd/sudo docker run /docker run -e CATTLE_HOST_LABELS=\"HOSTID=$instanceId&CLOUD=aws&CLUSTER=${cluster_name}\" }

iam.tf

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
# Autoscaling lifecycle hook role
# Allows lifecycle hooks to add messages to the SQS queue
resource "aws_iam_role" "lifecycle_role" {

    name = "${var.cluster_name}-lifecycle-hooks"
    assume_role_policy = <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "",
      "Effect": "Allow",
      "Principal": {
        "Service": "autoscaling.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}
EOF

    lifecycle {
        create_before_destroy = true
    }

}

# AWS managed lifecycle hook policy
resource "aws_iam_policy_attachment" "lifecycle_role_policy" {

    name = "AutoScalingNotificationAccessRole"
    policy_arn = "arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole"
    roles = [
        "${aws_iam_role.lifecycle_role.name}"
    ]

    lifecycle {
        create_before_destroy = true
    }

}

main.tf

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
# Rancher server details
variable "server_security_group_id" {
    description = "Security group id of the Rancher server so we can restrict incoming traffic."
}
variable "server_hostname" {
    description = "Hostname of the Rancher server."
}

# Target server environment
variable "environment_id" {
    description = "Target environment id for host registration."
}
variable "environment_access_key" {
    description = "API access key for the target environment."
}
variable "environment_secret_key" {
    description = "API secret key for the target environment."
}

# Cluster setup
variable "cluster_name" {
    description = "The name of the cluster. Best not to include non-alphanumeric characters. Will be used to name resources and tag instances."
}
variable "cluster_autoscaling_group_name" {
    description = "Name of the target autoscaling group."
}
variable "cluster_instance_security_group_id" {
    description = "ID of the security group used for host instances. Will be modified to include Rancher-specific rules."
}

# Lifecycle hooks queue arn
variable "lifecycle_hooks_sqs_queue_arn" {
    description = "ARN of the SQS queue used to receive autoscaling lifecycle hooks."
}

security-groups.tf

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
# Attach IPSEC rules to host instance security group.
# Enables the rancher overlay network for connected hosts.
# Traffic only allowed from other machines with this security group.
resource "aws_security_group_rule" "ipsec_ingress_1" {

    security_group_id = "${var.cluster_instance_security_group_id}"
    type = "ingress"
    from_port = 4500
    to_port = 4500
    protocol = "udp"
    source_security_group_id = "${var.cluster_instance_security_group_id}"

    lifecycle {
        create_before_destroy = true
    }

}

resource "aws_security_group_rule" "ipsec_ingress_2" {

    security_group_id = "${var.cluster_instance_security_group_id}"
    type = "ingress"
    from_port = 500
    to_port = 500
    protocol = "udp"
    source_security_group_id = "${var.cluster_instance_security_group_id}"

    lifecycle {
        create_before_destroy = true
    }

}

# SSH ingress
# Required for the server to connect & configure the host.
resource "aws_security_group_rule" "ssh_ingress" {

    security_group_id = "${var.cluster_instance_security_group_id}"
    type = "ingress"
    from_port = 22
    to_port = 22
    protocol = "tcp"
    source_security_group_id = "${var.server_security_group_id}"

    lifecycle {
        create_before_destroy = true
    }

}

# Outgoing HTTP
# Allows pulling of remote docker images, installing packages, etc.
resource "aws_security_group_rule" "http_egress" {

    security_group_id = "${var.cluster_instance_security_group_id}"
    type = "egress"
    from_port = 80
    to_port = 80
    protocol = "tcp"
    cidr_blocks = ["0.0.0.0/0"]

    lifecycle {
        create_before_destroy = true
    }

}

# Outgoing HTTPS
# Allows pulling of remote docker images, installing packages, etc.
resource "aws_security_group_rule" "https_egress" {

    security_group_id = "${var.cluster_instance_security_group_id}"
    type = "egress"
    from_port = 443
    to_port = 443
    protocol = "tcp"
    cidr_blocks = ["0.0.0.0/0"]

    lifecycle {
        create_before_destroy = true
    }

}
