class Terrafying::Components::Prometheus
Attributes
prometheus[R]
security_group[R]
thanos[R]
Public Class Methods
create_in(options)
click to toggle source
# File lib/terrafying/components/prometheus.rb, line 12
# Builds a component from +options+ (splatted into +new+ as keyword
# arguments), calls +create+ on it and returns the component itself.
def self.create_in(options)
  component = new(**options)
  component.create
  component
end
find_in(options)
click to toggle source
# File lib/terrafying/components/prometheus.rb, line 16
# Builds a component from +options+ (splatted into +new+ as keyword
# arguments), calls +find+ on it and returns the component itself.
def self.find_in(options)
  component = new(**options)
  component.find
  component
end
new( vpc:, thanos_name: 'thanos', thanos_version: 'v0.18.0', prom_name: 'prometheus', prom_version: 'v2.25.0', instances: 2, instance_type: 't3a.small', thanos_instance_type: 't3a.small', prometheus_tsdb_retention: '1d', prometheus_data_dir: '/var/lib/prometheus', prometheus_data_size: 20, prometheus_additional_scrape_configs: [] )
click to toggle source
Calls superclass method
# File lib/terrafying/components/prometheus.rb, line 20
# Stores the component configuration; nothing is created or looked up here.
#
# @param vpc the VPC object (required); other methods read its name, id,
#   cidr and zone
# @param thanos_name name given to the Thanos service
# @param thanos_version tag of the quay.io/thanos/thanos image
# @param prom_name name given to the Prometheus service
# @param prom_version tag of the quay.io/prometheus/prometheus image
# @param instances number of instances requested for each service
# @param instance_type instance type for the Prometheus service
#   (stored as @prometheus_instance_type)
# @param thanos_instance_type instance type for the Thanos service
# @param prometheus_tsdb_retention value for --storage.tsdb.retention.time
# @param prometheus_data_dir host directory used for the Prometheus data volume
# @param prometheus_data_size size passed to the data volume definition
# @param prometheus_additional_scrape_configs extra snippets appended to the
#   generated scrape_configs
def initialize(
  vpc:,
  thanos_name: 'thanos',
  thanos_version: 'v0.18.0',
  prom_name: 'prometheus',
  prom_version: 'v2.25.0',
  instances: 2,
  instance_type: 't3a.small',
  thanos_instance_type: 't3a.small',
  prometheus_tsdb_retention: '1d',
  prometheus_data_dir: '/var/lib/prometheus',
  prometheus_data_size: 20,
  prometheus_additional_scrape_configs: []
)
  super()

  @vpc = vpc
  @instances = instances

  # Thanos settings
  @thanos_name, @thanos_version = thanos_name, thanos_version
  @thanos_instance_type = thanos_instance_type

  # Prometheus settings
  @prom_name, @prom_version = prom_name, prom_version
  @prometheus_instance_type = instance_type
  @prometheus_tsdb_retention = prometheus_tsdb_retention
  @prometheus_data_dir, @prometheus_data_size = prometheus_data_dir, prometheus_data_size
  @prometheus_additional_scrape_configs = prometheus_additional_scrape_configs
end
Public Instance Methods
cloudwatch_alarm(name, namespace, dimensions)
click to toggle source
# File lib/terrafying/components/prometheus.rb, line 361
# Declares an 'aws_cloudwatch_metric_alarm' resource called +name+ on the
# 'UnHealthyHostCount' metric of +namespace+, scoped by +dimensions+.
# The alarm uses the 'Minimum' statistic over one 180-second period with a
# threshold of >= 1. Both the alarm and OK actions point at the same SNS topic.
def cloudwatch_alarm(name, namespace, dimensions)
  sns_topic = 'arn:aws:sns:eu-west-1:136393635417:prometheus_cloudwatch_topic'

  alarm = {
    alarm_name: name,
    comparison_operator: 'GreaterThanOrEqualToThreshold',
    evaluation_periods: '1',
    metric_name: 'UnHealthyHostCount',
    namespace: namespace,
    period: '180',
    threshold: '1',
    statistic: 'Minimum',
    alarm_description: "Monitoring #{name} target group host health",
    dimensions: dimensions,
    alarm_actions: [sns_topic],
    ok_actions: [sns_topic]
  }

  resource 'aws_cloudwatch_metric_alarm', name, **alarm
end
create()
click to toggle source
# File lib/terrafying/components/prometheus.rb, line 56
# Creates the Thanos query service (with its CloudWatch alarms) and then the
# Prometheus service, publishes an SRV record for the sidecars, and opens the
# ports the two services need to reach each other.
# Sets @thanos, @prometheus, @security_group and both vCPU counts.
def create
  sidecar_host = tf_safe(@prom_name)
  qualified_host = @vpc.zone.qualify(sidecar_host)
  sidecar_srv_fqdn = "_grpc._tcp.#{qualified_host}"

  # The vCPU counts are interpolated into the unit files, so they must be
  # resolved before any service is built.
  @prometheus_instance_vcpu_count = aws.instance_type_vcpu_count(@prometheus_instance_type)
  @thanos_instance_vcpu_count = aws.instance_type_vcpu_count(@thanos_instance_type)

  @thanos = create_thanos(sidecar_srv_fqdn)
  create_thanos_cloudwatch_alert(@thanos)

  @prometheus = create_prom
  @security_group = @prometheus.egress_security_group

  # Form SRV record with thanos-sidecars
  sidecar_targets = @prom_service.domain_names.drop(1)
  @vpc.zone.add_srv_in(self, sidecar_host, 'grpc', 10_901, 'tcp', sidecar_targets)

  # Allow Prometheus to scrape Thanos Query
  @thanos.used_by(@prometheus) do |rule|
    rule[:upstream_port] == 10_902
  end

  # Allow Thanos Query instance to reach Prometheus running Thanos Sidecar
  @prometheus.used_by(@thanos) do |rule|
    rule[:upstream_port] == 10_901
  end

  # Allow connections from VPC to Thanos Query services
  vpc_facing_ports = [10_902, 10_901]
  @thanos.used_by_cidr(@vpc.cidr) do |rule|
    vpc_facing_ports.include?(rule[:upstream_port])
  end
end
create_prom()
click to toggle source
# File lib/terrafying/components/prometheus.rb, line 79
# Creates the Prometheus service in @vpc, registers it with +add!+ and stores
# the result in @prom_service (which is also the return value).
# Each instance runs the Prometheus and Thanos sidecar units and gets the
# data volume plus IAM access to the Thanos store.
def create_prom
  listen_ports = [9090, 10_902, 10_901].map do |number|
    { type: 'tcp', number: number }
  end

  # These tags match what the ec2 service-discovery jobs in prometheus_conf
  # look for (prometheus_port/prometheus_path plus the "_0" suffixed pair).
  discovery_tags = {
    prometheus_port: 9090,
    prometheus_path: '/metrics',
    prometheus_port_0: 10_902,
    prometheus_path_0: '/metrics'
  }

  @prom_service = add! Terrafying::Components::Service.create_in(
    @vpc, @prom_name,
    ports: listen_ports,
    instance_type: @prometheus_instance_type,
    iam_policy_statements: thanos_store_access,
    instances: [{}] * @instances,
    units: [prometheus_unit, thanos_sidecar_unit],
    files: [prometheus_conf, thanos_bucket],
    volumes: [prometheus_data_volume],
    tags: discovery_tags
  )
end
create_thanos(prometheus_thanos_sidecar_srv_fqdn)
click to toggle source
# File lib/terrafying/components/prometheus.rb, line 111
# Creates the load-balanced Thanos query service in @vpc, registers it with
# +add!+ and stores the result in @thanos_service (also the return value).
#
# @param prometheus_thanos_sidecar_srv_fqdn SRV name handed to the Thanos
#   query unit
def create_thanos(prometheus_thanos_sidecar_srv_fqdn)
  http_port = {
    type: 'tcp',
    number: 10_902,
    health_check: { protocol: 'HTTP', path: '/-/healthy' }
  }
  grpc_port = {
    type: 'tcp',
    number: 10_901,
    health_check: { protocol: 'TCP' }
  }
  discovery_tags = { prometheus_port: 10_902, prometheus_path: '/metrics' }

  @thanos_service = add! Terrafying::Components::Service.create_in(
    @vpc, @thanos_name,
    ports: [http_port, grpc_port],
    instance_type: @thanos_instance_type,
    units: [thanos_unit(prometheus_thanos_sidecar_srv_fqdn)],
    instances: [{}] * @instances,
    loadbalancer: true,
    tags: discovery_tags
  )
end
create_thanos_cloudwatch_alert(service)
click to toggle source
# File lib/terrafying/components/prometheus.rb, line 377
# Declares one CloudWatch alarm per load balancer target of +service+.
# Alarms are named "<service name>_<index>" and use the 'AWS/NetworkELB'
# namespace with LoadBalancer and TargetGroup dimensions.
#
# @param service an object exposing +name+ and +load_balancer+; the load
#   balancer must expose +name+ and +targets+, and each target a +target_group+
def create_thanos_cloudwatch_alert(service)
  load_balancer = service.load_balancer

  load_balancer.targets.each_with_index do |target, i|
    # The aws_lb resource name gets a '_' prefix when the load balancer name
    # starts with a digit.
    lb_resource_name = load_balancer.name.sub(/\A(\d)/, '_\1')

    # Swap only the trailing `id` attribute of the target group reference for
    # `arn_suffix`. Replacing every "id" would also mangle resource names that
    # contain those letters (e.g. "sidecar", "grid").
    # NOTE(review): assumes target_group.to_s ends in "id" or "id}" — confirm.
    target_group_suffix = target.target_group.to_s.sub(/id(\}?)\z/, 'arn_suffix\1')

    cloudwatch_alarm "#{service.name}_#{i}", 'AWS/NetworkELB',
                     LoadBalancer: output_of('aws_lb', lb_resource_name, 'arn_suffix'),
                     TargetGroup: target_group_suffix
  end
end
expose_in(vpc)
click to toggle source
# File lib/terrafying/components/prometheus.rb, line 351
# Exposes the Thanos service into +vpc+ through an endpoint service and
# returns the registered endpoint. The endpoint service is built once
# (acceptance_required: false) and memoised in @endpoint_service. The
# endpoint is then opened to the target VPC's CIDR.
def expose_in(vpc)
  @endpoint_service ||= @thanos.with_endpoint_service(acceptance_required: false)

  add!(@endpoint_service.expose_in(vpc, {})).tap do |endpoint|
    endpoint.used_by_cidr(vpc.cidr)
  end
end
find()
click to toggle source
# File lib/terrafying/components/prometheus.rb, line 49
# Looks up the security group named "staticset-<vpc name>-<prom name>" in
# @vpc and stores it in @security_group.
def find
  group_name = "staticset-#{@vpc.name}-#{@prom_name}"
  @security_group = aws.security_group_in_vpc(@vpc.id, group_name)
end
prometheus_conf()
click to toggle source
# Returns the file definition for '/opt/prometheus/prometheus.yml' (mode 0o644).
# The contents are an ERB template rendered against this object's binding; the
# template reads @vpc.name, @vpc.id and @prometheus_additional_scrape_configs.
# It emits one ec2 service-discovery job per tag suffix ('' and '_0'..'_3'),
# i.e. five jobs, followed by each additional scrape config verbatim.
# The literal '{{HOST}}' is not ERB syntax and is left in the rendered output;
# prometheus_unit runs a sed step that replaces it with %H.
# NOTE(review): the positional safe_level/trim_mode/eoutvar arguments to
# ERB.new are deprecated in favour of the trim_mode:/eoutvar: keywords.
# NOTE(review): line breaks and indentation inside the heredoc are collapsed in
# this listing; check the YAML layout against the real source file.
# File lib/terrafying/components/prometheus.rb, line 228 def prometheus_conf { path: '/opt/prometheus/prometheus.yml', mode: 0o644, contents: ERB.new(<<~'END', 0, '-', '_').result(binding) global: external_labels: monitor: prometheus cluster: <%= @vpc.name %> replica: {{HOST}} scrape_interval: 15s scrape_configs: # While AWS EC2 instance support up to 50 tags, we wouldn't be able # to fit such a long configuration file into user_data of the # instance; user_data is limited to just 16k. # This configuration support scraping up to 5 ports per instance: <%- prom_tag_name_suffixes = [''] + (0..3).map {|i| "_#{i}"} -%> <%- prom_tag_name_suffixes.each do |suffix| -%> - job_name: ec2<%= suffix %> params: format: ["prometheus"] ec2_sd_configs: - region: eu-west-1 filters: - name: vpc-id values: ["<%= @vpc.id %>"] - # by using the same ec2_sd_configs we are able to share single # provider instance thanks to SD configuration coalescing # therefore "prometheus_port" tag must always be present on # the instance to be discovered (i. e. "prometheus_port_*" tag # would not be sufficient if " " tag is missing) name: tag-key values: ["prometheus_port"] relabel_configs: - source_labels: [__meta_ec2_tag_prometheus_port<%= suffix %>] regex: (.+) action: keep - source_labels: [__meta_ec2_private_ip, __meta_ec2_tag_prometheus_port<%= suffix %>] target_label: __address__ separator: ':' - source_labels: [__meta_ec2_tag_prometheus_path<%= suffix %>] target_label: __metrics_path__ regex: (.+) - source_labels: [__meta_ec2_instance_id] target_label: instance_id - source_labels: [__meta_ec2_tag_envoy_cluster] target_label: envoy_cluster - source_labels: [__meta_ec2_tag_service_name] target_label: service_name <%- end -%> <%- @prometheus_additional_scrape_configs.each do |conf| -%> <%= conf %> <%- end -%> END } end
prometheus_data_volume()
click to toggle source
# File lib/terrafying/components/prometheus.rb, line 142
# Returns the volume definition for the Prometheus data disk: a volume named
# 'prometheus_data' on device '/dev/xvdl', mounted at @prometheus_data_dir
# with size @prometheus_data_size.
def prometheus_data_volume
  mount_point = @prometheus_data_dir
  volume_size = @prometheus_data_size

  {
    name: 'prometheus_data',
    mount: mount_point,
    device: '/dev/xvdl',
    size: volume_size
  }
end
prometheus_unit()
click to toggle source
# Returns the unit definition for 'prometheus.service': a hash with the unit
# name and the systemd unit text that runs Prometheus in a docker container.
# The unit text interpolates @prom_version (image tag), @prometheus_data_dir
# (host side of the data mount), @prometheus_tsdb_retention and
# @prometheus_instance_vcpu_count (used for both concurrency flags). The vCPU
# count is assigned in +create+, so it is nil if this is called earlier.
# Pre-start steps create the 'prom' docker network, remove any old container,
# pull the image, replace '{{HOST}}' in prometheus.yml with %H, and create the
# data directory owned by nobody. Steps prefixed with '-' may fail harmlessly.
# NOTE(review): line breaks inside the heredoc are collapsed in this listing.
# File lib/terrafying/components/prometheus.rb, line 152 def prometheus_unit { name: 'prometheus.service', contents: <<~PROM_UNIT [Install] WantedBy=multi-user.target [Unit] Description=Prometheus Service After=docker.service Requires=docker.service [Service] ExecStartPre=-/usr/bin/docker network create --driver bridge prom ExecStartPre=-/usr/bin/docker kill prometheus ExecStartPre=-/usr/bin/docker rm prometheus ExecStartPre=/usr/bin/docker pull quay.io/prometheus/prometheus:#{@prom_version} ExecStartPre=-/usr/bin/sed -i "s/{{HOST}}/%H/" /opt/prometheus/prometheus.yml ExecStartPre=/usr/bin/install -d -o nobody -g nobody -m 0755 #{@prometheus_data_dir} ExecStart=/usr/bin/docker run --name prometheus \ -p 9090:9090 \ --network=prom \ -v /opt/prometheus:/opt/prometheus \ -v #{@prometheus_data_dir}:/var/lib/prometheus \ quay.io/prometheus/prometheus:#{@prom_version} \ --storage.tsdb.path=/var/lib/prometheus/tsdb \ --storage.tsdb.retention.time=#{@prometheus_tsdb_retention} \ --storage.tsdb.min-block-duration=2h \ --storage.tsdb.max-block-duration=2h \ --storage.tsdb.no-lockfile \ --storage.remote.read-concurrent-limit=#{@prometheus_instance_vcpu_count} \ --query.max-concurrency=#{@prometheus_instance_vcpu_count} \ --config.file=/opt/prometheus/prometheus.yml \ --web.console.templates=/etc/prometheus/consoles \ --web.console.libraries=/etc/prometheus/console_libraries \ --web.enable-lifecycle \ --log.level=warn Restart=always RestartSec=30 PROM_UNIT } end
thanos_bucket()
click to toggle source
# Returns the file definition for '/opt/thanos/bucket.yml' (mode 0o644).
# The contents are a static S3 config naming the 'uswitch-thanos-store' bucket
# and the eu-west-1 S3 endpoint. This is the same bucket that the statements
# in thanos_store_access grant access to.
# NOTE(review): indentation inside the heredoc is collapsed in this listing.
# File lib/terrafying/components/prometheus.rb, line 315 def thanos_bucket { path: '/opt/thanos/bucket.yml', mode: 0o644, contents: <<~S3CONF type: S3 config: bucket: uswitch-thanos-store endpoint: s3.eu-west-1.amazonaws.com S3CONF } end
thanos_sidecar_unit()
click to toggle source
# Returns the unit definition for the Thanos sidecar, named 'thanos.service'.
# The unit is ordered after, and requires, docker.service and
# prometheus.service. It runs the thanos image tagged @thanos_version in
# 'sidecar' mode on the 'prom' docker network, pointing at
# http://prometheus:9090. It mounts @prometheus_data_dir at /var/lib/prometheus
# and /opt/thanos (where the --objstore.config-file lives), and publishes
# ports 10901-10902.
# NOTE(review): line breaks inside the heredoc are collapsed in this listing.
# File lib/terrafying/components/prometheus.rb, line 195 def thanos_sidecar_unit { name: 'thanos.service', contents: <<~THANOS_SIDE [Install] WantedBy=multi-user.target [Unit] Description=Thanos Service After=docker.service prometheus.service Requires=docker.service prometheus.service [Service] ExecStartPre=-/usr/bin/docker kill thanos ExecStartPre=-/usr/bin/docker rm thanos ExecStartPre=/usr/bin/docker pull quay.io/thanos/thanos:#{@thanos_version} ExecStart=/usr/bin/docker run --name thanos \ -p 10901-10902:10901-10902 \ -v #{@prometheus_data_dir}:/var/lib/prometheus \ -v /opt/thanos:/opt/thanos \ --network=prom \ quay.io/thanos/thanos:#{@thanos_version} \ sidecar \ --prometheus.url=http://prometheus:9090 \ --tsdb.path=/var/lib/prometheus/tsdb \ --objstore.config-file=/opt/thanos/bucket.yml \ --log.level=warn Restart=always RestartSec=30 THANOS_SIDE } end
thanos_store_access()
click to toggle source
# File lib/terrafying/components/prometheus.rb, line 328
# Returns two IAM policy statements: one allowing ec2:DescribeInstances on
# every resource, and one allowing list/get/delete/put on the
# 'uswitch-thanos-store' bucket and the objects inside it.
def thanos_store_access
  bucket_arn = 'arn:aws:s3:::uswitch-thanos-store'

  describe_instances = {
    Action: %w[ec2:DescribeInstances],
    Effect: 'Allow',
    Resource: '*'
  }
  bucket_read_write = {
    Action: %w[s3:ListBucket s3:GetObject s3:DeleteObject s3:PutObject],
    Effect: 'Allow',
    Resource: ["#{bucket_arn}/*", bucket_arn]
  }

  [describe_instances, bucket_read_write]
end
thanos_unit(prometheus_thanos_sidecar_srv_fqdn)
click to toggle source
# Returns the unit definition for the Thanos query service, named
# 'thanos.service'. The unit runs the thanos image tagged @thanos_version in
# 'query' mode, publishes ports 10901-10902, de-duplicates on the 'replica'
# label, and caps concurrent queries at @thanos_instance_vcpu_count. That
# count is assigned in +create+, so it is nil if this is called earlier.
#
# prometheus_thanos_sidecar_srv_fqdn is interpolated into the
# '--store=dnssrv+...' flag.
# NOTE(review): line breaks inside the heredoc are collapsed in this listing.
# File lib/terrafying/components/prometheus.rb, line 285 def thanos_unit(prometheus_thanos_sidecar_srv_fqdn) { name: 'thanos.service', contents: <<~THANOS_UNIT [Install] WantedBy=multi-user.target [Unit] Description=Thanos Service After=docker.service Requires=docker.service [Service] ExecStartPre=-/usr/bin/docker kill thanos ExecStartPre=-/usr/bin/docker rm thanos ExecStartPre=/usr/bin/docker pull quay.io/thanos/thanos:#{@thanos_version} ExecStart=/usr/bin/docker run --name thanos \ -p 10901-10902:10901-10902 \ quay.io/thanos/thanos:#{@thanos_version} \ query \ --query.replica-label=replica \ --query.max-concurrent=#{@thanos_instance_vcpu_count} \ --store=dnssrv+#{prometheus_thanos_sidecar_srv_fqdn} \ --log.level=warn Restart=always RestartSec=30 THANOS_UNIT } end