diff --git a/Dockerfile b/Dockerfile index 95558dc..e07448f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,7 +19,7 @@ RUN /usr/sbin/enable_insecure_key # Latest version ENV GRAFANA_VERSION 3.1.1-1470047149 ENV INFLUXDB_VERSION 1.0.2 -ENV TELEGRAF_VERSION 1.0.1 +ENV TELEGRAF_VERSION 1.1.2 RUN apt-get -y update && \ apt-get -y install \ @@ -90,7 +90,7 @@ RUN curl -s -o /tmp/telegraf_latest_amd64.deb https://dl.influxdata.com/tele dpkg -i /tmp/telegraf_latest_amd64.deb && \ rm /tmp/telegraf_latest_amd64.deb -ADD docker/telegraf/telegraf.conf /etc/telegraf/telegraf.conf +ADD docker/telegraf/telegraf.toml /etc/telegraf/telegraf.conf RUN mkdir /etc/service/telegraf ADD docker/telegraf/telegraf.launcher.sh /etc/service/telegraf/run diff --git a/dashboards/data_steaming_collector_dashboard.json b/dashboards/data_steaming_collector_dashboard.json index c819117..77f920d 100644 --- a/dashboards/data_steaming_collector_dashboard.json +++ b/dashboards/data_steaming_collector_dashboard.json @@ -96,6 +96,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT mean(\"value\") FROM \"jnpr.analyticsd\" WHERE (\"type\" = 'traffic-stats.rxpps' OR \"type\" = 'traffic-stats.txpps') AND \"device\" =~ /$host_regex/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\", time($GroupBy)", @@ -106,6 +107,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"value\"), 1s) FROM \"jnpr.jvision\" WHERE (\"type\" = 'egress_stats.if_packets' OR \"type\" = 'ingress_stats.if_packets') AND \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\", time($GroupBy)", @@ -116,6 +118,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"value\"), 1s) FROM \"jnpr.jvision\" WHERE (\"type\" = 'egress_stats.if_pkts' OR 
\"type\" = 'ingress_stats.if_pkts') AND \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\", time($GroupBy)", @@ -126,6 +129,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"counters/out-packets\"), 1s) AS \"OUT\", derivative(mean(\"counters/in-packets\"), 1s) AS \"IN\" FROM \"openconfig-ifd\" WHERE \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY interface, device, time($GroupBy)", @@ -136,6 +140,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"value\"), 1s) FROM \"jnpr.jvision\" WHERE (\"type\" = 'egress_stats.if_fc_stats.if_packets' OR \"type\" = 'ingress_stats.if_fc_stats.if_packets') AND \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\", \"family\", \"forwarding_class\", time($GroupBy)", @@ -237,6 +242,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT mean(\"value\") FROM \"jnpr.analyticsd\" WHERE \"type\" = 'traffic-stats.txbps' AND \"device\" =~ /$host_regex/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\", time($GroupBy)", @@ -247,6 +253,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"value\"), 1s)*8 FROM \"jnpr.jvision\" WHERE (\"type\" = 'egress_stats.if_octets' OR \"type\" = 'ingress_stats.if_octets') AND \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\", time($GroupBy)", @@ -257,6 +264,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT mean(\"value\") FROM \"jnpr.analyticsd\" WHERE \"type\" = 'traffic-stats.rxbps' AND \"device\" =~ 
/$host_regex/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\", time($GroupBy)", @@ -267,6 +275,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"counters/out-octets\"), 1s)*8 AS \"OUT\", derivative(mean(\"counters/in-octets\"), 1s)*8 AS \"IN\" FROM \"openconfig-ifd\" WHERE \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY interface, device, time($GroupBy)", @@ -277,6 +286,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"value\"), 1s)*8 FROM \"jnpr.jvision\" WHERE (\"type\" = 'egress_stats.if_fc_stats.if_octets' OR \"type\" = 'ingress_stats.if_fc_stats.if_octets') AND \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\", \"family\", \"forwarding_class\", time($GroupBy)", @@ -363,6 +373,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"value\"), 1s) FROM \"jnpr.jvision\" WHERE \"type\" = 'egress_queue_info.packets' AND \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"egress_queue\", time($GroupBy)", @@ -373,6 +384,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"counters/out-discards\"), 1s) AS \"Discard out\" FROM \"openconfig-ifd\" WHERE \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY interface, device, time($GroupBy)", @@ -474,6 +486,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(\"value\", 1s) FROM \"jnpr.analyticsd\" WHERE ( \"type\" = 'traffic-stats.txmcpkt' OR \"type\" = 'traffic-stats.txucpkt' OR \"type\" = 'traffic-stats.txmcpkt' ) AND \"device\" 
=~ /$host_regex/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\"", @@ -484,6 +497,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"value\"), 1s) FROM \"jnpr.jvision\" WHERE (\"type\" = 'egress_stats.if_uc_pkts' OR \"type\" = 'egress_stats.if_bc_pkts' OR \"type\" = 'ingress_stats.if_mc_pkts') AND \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\", time($GroupBy)", @@ -494,6 +508,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(\"value\", 1s) FROM \"jnpr.analyticsd\" WHERE ( \"type\" = 'traffic-stats.rxmcpkt' OR \"type\" = 'traffic-stats.rxbcpkt' OR \"type\" = 'traffic-stats.rxucpkt' ) AND \"device\" =~ /$host_regex/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\"", @@ -504,6 +519,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"counters/out-unicast-pkts\"), 1s) AS \"Ucast out\", derivative(mean(\"counters/in-unicast-pkts\"), 1s) AS \"Ucast in\" FROM \"openconfig-ifd\" WHERE \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY interface, device, time($GroupBy)", @@ -514,6 +530,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"counters/out-broadcast-pkts\"), 1s) AS \"Bcast out\", derivative(mean(\"counters/in-broadcast-pkts\"), 1s) AS \"Bcast in\" FROM \"openconfig-ifd\" WHERE \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY interface, device, time($GroupBy)", @@ -524,6 +541,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"counters/out-multicast-pkts\"), 1s) AS \"Mcast out\", 
derivative(mean(\"counters/in-multicast-pkts\"), 1s) AS \"Mcast in\" FROM \"openconfig-ifd\" WHERE \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY interface, device, time($GroupBy)", @@ -623,6 +641,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"value\"), 1s) FROM \"jnpr.analyticsd\" WHERE (\"type\" = 'traffic-stats.rxcrcerr' OR \"type\" = 'traffic-stats.rxdroppkt') AND \"device\" =~ /$host_regex/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\", time($GroupBy)", @@ -633,6 +652,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"value\"), 1s) FROM \"jnpr.jvision\" WHERE (\"type\" = 'ingress_errors.if_in_discards' OR \"type\" = 'ingress_errors.if_in_errors' OR \"type\" = 'ingress_errors.if_in_frame_errors' OR \"type\" = 'ingress_errors.if_in_l2_mismatch_timeouts' OR \"type\" = 'ingress_errors.if_in_l2chan_errors' OR \"type\" = 'ingress_errors.if_in_l3_incompletes' OR \"type\" = 'ingress_errors.if_in_qdrops' OR \"type\" = 'ingress_errors.if_in_resource_errors' OR \"type\" = 'ingress_errors.if_in_runts') AND \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"type\", time($GroupBy)", @@ -734,6 +754,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT \"value\" FROM \"jnpr.analyticsd\" WHERE \"type\" = 'queue-stats.latency' AND \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"interface\", \"device\"", @@ -822,6 +843,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT mean(\"value\") FROM \"jnpr.analyticsd\" WHERE \"type\" = 'queue-stats.queue-depth' AND \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND 
$timeFilter GROUP BY time($interval), \"device\", \"interface\"", @@ -832,6 +854,7 @@ } ,{ "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT \"value\" FROM \"jnpr.jvision\" WHERE \"type\" = 'egress_queue_info.cur_buffer_occupancy' AND \"device\" =~ /$host_regex$/ AND \"interface\" =~ /$interface$/ AND $timeFilter GROUP BY \"device\", \"interface\"", @@ -933,6 +956,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT \"value\"*8 FROM \"jnpr.jvision\" WHERE \"type\" = 'lspstata.byte_rate' AND \"device\" =~ /$host_regex$/ AND $timeFilter GROUP BY \"device\", \"lspname\", \"type\"", @@ -1019,6 +1043,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT \"value\" FROM \"jnpr.jvision\" WHERE \"type\" = 'lspstata.packet_rate' AND \"device\" =~ /$host_regex$/ AND $timeFilter GROUP BY \"device\", \"lspname\"", @@ -1118,6 +1143,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"value\"), 1s) FROM \"jnpr.jvision\" WHERE \"type\" = 'filter_counter.bytes' AND \"device\" =~ /$host_regex$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"filter_counter_name\", time($GroupBy)", @@ -1206,6 +1232,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT derivative(mean(\"value\"), 1s) FROM \"jnpr.jvision\" WHERE \"type\" = 'filter_counter.packets' AND \"device\" =~ /$host_regex$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"filter_counter_name\", time($GroupBy)", @@ -1294,6 +1321,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT mean(\"value\") FROM \"jnpr.jvision\" WHERE \"type\" = 'memory_usage.HEAP' AND \"device\" =~ /$host_regex$/ AND $timeFilter GROUP BY \"device\", \"interface\", \"filter_name\", time($GroupBy)", @@ -1395,6 +1423,7 
@@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT mean(\"value\") FROM \"jnpr.jvision\" WHERE \"type\" = 'cpu_mem.bytes_allocated' AND \"device\" =~ /$host_regex$/ AND $timeFilter GROUP BY \"device\", \"interface\", time($GroupBy)", @@ -1483,6 +1512,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT mean(\"value\") FROM \"jnpr.jvision\" WHERE \"type\" = 'cpu_mem.size' AND \"device\" =~ /$host_regex$/ AND $timeFilter GROUP BY \"device\", \"interface\", time($GroupBy)", @@ -1571,6 +1601,7 @@ "targets": [ { "column": "value", + "alias": "", "dsType": "influxdb", "interval": "", "query": "SELECT mean(\"value\") FROM \"jnpr.jvision\" WHERE \"type\" = 'cpu_mem.utilization' AND \"device\" =~ /$host_regex$/ AND $timeFilter GROUP BY \"device\", \"interface\", time($GroupBy)", diff --git a/dashboards/open_nti_internal_monitoring.json b/dashboards/open_nti_internal_monitoring.json new file mode 100644 index 0000000..71bd22f --- /dev/null +++ b/dashboards/open_nti_internal_monitoring.json @@ -0,0 +1,416 @@ +{ + "id": null, + "title": "Open NTI Internal Monitoring", + "originalTitle": "Open NTI Internal Monitoring", +"tags": [ + "opennti","internal" + ], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": false, + "sharedCrosshair": false, + "rows": [ + { + "collapse": true, + "editable": true, + "height": "300px", + "panels": [ + + { + "aliasColors": {}, + "bars": false, + "datasource": "opennti_internal", + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "legend": { + + "avg": true, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "alignAsTable": true, + "values": false + + }, + "lines": true, + "linewidth": 1, + "links": [], 
+ "nullPointMode": "connected", + "percentage": false, + "pointradius": 1, + "points": true, + "renderer": "flot", + "seriesOverrides": [ + + ], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ +{ + "column": "value", + "alias": "($tag_role) $tag_host cpu.idle", + "dsType": "influxdb", + "interval": "", + "query": "SELECT mean(\"usage_idle\") FROM \"cpu\" WHERE \"cpu\" = 'cpu-total' AND $timeFilter GROUP BY time($interval), \"host\", \"role\"", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "policy": "default" + } + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Idle", + "tooltip": { + "shared": false, + "value_type": "cumulative", + "msResolution": false + }, + "type": "graph", + "yaxes": [ + + + + + { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short", + "label": "" + } +, + + { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short", + "label": "" + } + + + ], + "xaxis": { + "show": true + } +},{ + "aliasColors": {}, + "bars": false, + "datasource": "opennti_internal", + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "legend": { + + "avg": true, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "alignAsTable": true, + "values": false + + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 1, + "points": true, + "renderer": "flot", + "seriesOverrides": [ + + ], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ +{ + "column": "value", + "alias": "($tag_role) $tag_host mem.used", + "dsType": "influxdb", + "interval": "", + "query": "SELECT mean(\"used_percent\") FROM \"mem\" WHERE $timeFilter GROUP BY time($interval), \"host\", \"role\"", + "rawQuery": 
true, + "refId": "A", + "resultFormat": "time_series", + "policy": "default" + } + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "value_type": "cumulative", + "msResolution": false + }, + "type": "graph", + "yaxes": [ + + + + + { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short", + "label": "" + } +, + + { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short", + "label": "" + } + + + ], + "xaxis": { + "show": true + } +},{ + "aliasColors": {}, + "bars": false, + "datasource": "opennti_internal", + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "legend": { + + "avg": true, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "alignAsTable": true, + "values": false + + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 1, + "points": true, + "renderer": "flot", + "seriesOverrides": [ + + { + "alias": "/net.out/i", + "transform": "negative-Y" + } + + ], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ +{ + "column": "value", + "alias": "($tag_role) $tag_host net.in", + "dsType": "influxdb", + "interval": "", + "query": "SELECT non_negative_derivative(mean(\"bytes_recv\"), 1s) FROM \"net\" WHERE \"interface\" = 'eth0' AND $timeFilter GROUP BY time($interval), \"host\", \"role\"", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "policy": "default" + } +,{ + "column": "value", + "alias": "($tag_role) $tag_host net.out", + "dsType": "influxdb", + "interval": "", + "query": "SELECT non_negative_derivative(mean(\"bytes_sent\"), 1s) FROM \"net\" WHERE \"interface\" = 'eth0' AND $timeFilter GROUP BY time($interval), \"host\", \"role\"", 
+ "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "policy": "default" + } + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network", + "tooltip": { + "shared": false, + "value_type": "cumulative", + "msResolution": false + }, + "type": "graph", + "yaxes": [ + + + { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "bps", + "label": "RX / TX" + } +, + + { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short", + "label": "" + } + + + ], + "xaxis": { + "show": true + } +} + + ], + + "showTitle": true, + "title": "CPU / Memory / Net" + +} + ], + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "collapse": false, + "enable": true, + "notice": false, + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "status": "Stable", + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ], + "type": "timepicker" + }, + + "templating": { + "enable": true, + "list": [ + { + "type": "interval", + "datasource": null, + "refresh": 1, + "name": "GroupBy", + "hide": 0, + "options": [ + { + "text": "auto", + "value": "$__auto_interval", + "selected": true + }, + { + "text": "1s", + "value": "1s", + "selected": false + }, + { + "text": "10s", + "value": "10s", + "selected": false + }, + { + "text": "30s", + "value": "30s", + "selected": false + }, + { + "text": "1m", + "value": "1m", + "selected": false + }, + { + "text": "5m", + "value": "5m", + "selected": false + } + ], + "includeAll": false, + "multi": false, + "query": "1s,10s,30s,1m,5m", + "current": { + "text": "auto", + "value": "$__auto_interval" + }, + "regex": "", + "auto": true, + "auto_count": 100, + "auto_min": "1s" + } + ] + }, + + + "refresh": "1m", + "schemaVersion": 12, + "version": 0, + "links": [] +} \ No newline at end of file diff --git a/dashboards/opennti_internal.yaml 
b/dashboards/opennti_internal.yaml new file mode 100644 index 0000000..aee2af2 --- /dev/null +++ b/dashboards/opennti_internal.yaml @@ -0,0 +1,13 @@ + +title: Open NTI Internal Monitoring +template: "dashboard_base.j2" + +tags: + - opennti + - internal + +templatings: + - group_by.yaml + +rows: + - internal/cpumem.yaml diff --git a/dashboards/templates/graphs/graph-lines-02.j2 b/dashboards/templates/graphs/graph-lines-02.j2 index 8e69c9e..55eec22 100644 --- a/dashboards/templates/graphs/graph-lines-02.j2 +++ b/dashboards/templates/graphs/graph-lines-02.j2 @@ -13,13 +13,25 @@ }, "id": {{ id }}, "legend": { +{% if legend is defined %} + "avg": {{ legend.avg | default("false") }}, + "current": {{ legend.current | default("false") }}, + "max": {{ legend.max | default("false") }}, + "min": {{ legend.min | default("false") }}, + "show": true, + "total": {{ legend.total | default("false") }}, + "alignAsTable": {{ legend.table | default("false") }}, + "values": {{ legend.value | default("false") }} +{% else %} "avg": false, "current": false, "max": false, "min": false, "show": false, + "alignAsTable": false, "total": false, "values": false +{% endif%} }, "lines": true, "linewidth": 1, @@ -46,6 +58,7 @@ {% for id, target in targets.iteritems() -%} { "column": "value", + "alias": "{{ target.alias | default("") }}", "dsType": "influxdb", "interval": "", "query": "{{ target.query|replace('"', '\\"')|trim }}", diff --git a/dashboards/templates/graphs/internal/cpu.yaml b/dashboards/templates/graphs/internal/cpu.yaml new file mode 100644 index 0000000..c1e559c --- /dev/null +++ b/dashboards/templates/graphs/internal/cpu.yaml @@ -0,0 +1,22 @@ + +template: graph-lines-02.j2 +datasource: opennti_internal +title: "CPU Idle" +span: 4 + +targets: + A: + query: > + SELECT mean("usage_idle") + FROM "cpu" + WHERE "cpu" = 'cpu-total' AND $timeFilter + GROUP BY time($interval), "host", "role" + alias: ($tag_role) $tag_host cpu.idle + +legend: + avg: "true" + current: "true" + table: "true" 
+ +templatings_used: + - group_by.yaml diff --git a/dashboards/templates/graphs/internal/mem.yaml b/dashboards/templates/graphs/internal/mem.yaml new file mode 100644 index 0000000..39479db --- /dev/null +++ b/dashboards/templates/graphs/internal/mem.yaml @@ -0,0 +1,23 @@ + +template: graph-lines-02.j2 +datasource: opennti_internal +title: "Memory Used" +span: 4 + +targets: + A: + query: > + SELECT mean("used_percent") + FROM "mem" + WHERE $timeFilter + GROUP BY time($interval), "host", "role" + alias: ($tag_role) $tag_host mem.used + + +legend: + avg: "true" + current: "true" + table: "true" + +templatings_used: + - group_by.yaml diff --git a/dashboards/templates/graphs/internal/net.yaml b/dashboards/templates/graphs/internal/net.yaml new file mode 100644 index 0000000..60af0dd --- /dev/null +++ b/dashboards/templates/graphs/internal/net.yaml @@ -0,0 +1,38 @@ + +template: graph-lines-02.j2 +datasource: opennti_internal +title: "Network" +span: 4 + +targets: + A: + query: > + SELECT non_negative_derivative(mean("bytes_recv"), 1s) + FROM "net" + WHERE "interface" = 'eth0' AND $timeFilter + GROUP BY time($interval), "host", "role" + alias: ($tag_role) $tag_host net.in + B: + query: > + SELECT non_negative_derivative(mean("bytes_sent"), 1s) + FROM "net" + WHERE "interface" = 'eth0' AND $timeFilter + GROUP BY time($interval), "host", "role" + alias: ($tag_role) $tag_host net.out + +templatings_used: + - group_by.yaml + +legend: + avg: "true" + current: "true" + table: "true" + +yaxes: + - format: bps + label: RX / TX + - format: short + +series_overrides: + - alias: "/net.out/i" + transform: "negative-Y" diff --git a/dashboards/templates/rows/internal/cpumem.yaml b/dashboards/templates/rows/internal/cpumem.yaml new file mode 100644 index 0000000..a5dccd6 --- /dev/null +++ b/dashboards/templates/rows/internal/cpumem.yaml @@ -0,0 +1,9 @@ + +title: CPU / Memory / Net +height: 300px +template: row_base.j2 +panels: + graphs: + - internal/cpu.yaml + - internal/mem.yaml + - 
internal/net.yaml diff --git a/docker/grafana/grafana.init.sh b/docker/grafana/grafana.init.sh index e4a7296..2a720be 100644 --- a/docker/grafana/grafana.init.sh +++ b/docker/grafana/grafana.init.sh @@ -25,6 +25,10 @@ function waitAndConfigureGrafana -X POST -H 'Content-Type: application/json;charset=UTF-8' \ --data-binary '{"name":"influxdb","type":"influxdb","access":"proxy","url":"http://localhost:8086","password":"juniper","user":"juniper","database":"juniper","basicAuth":false,"isDefault":true}' + curl 'http://admin:admin@localhost:3000/api/datasources' \ + -X POST -H 'Content-Type: application/json;charset=UTF-8' \ + --data-binary '{"name":"opennti_internal","type":"influxdb","access":"proxy","url":"http://localhost:8086","database":"opennti_internal","basicAuth":false,"isDefault":false}' + echo Done configuring Grafana exit 0 } diff --git a/docker/telegraf/telegraf.toml b/docker/telegraf/telegraf.toml new file mode 100644 index 0000000..73b3cb7 --- /dev/null +++ b/docker/telegraf/telegraf.toml @@ -0,0 +1,339 @@ + +# Global tags can be specified here in key="value" format. +[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + role = "main" + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "10s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). 
+ metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default, precision will be set to the same timestamp order as the + ## collection interval, with the maximum being 1s. + ## Precision will NOT be used for service inputs, such as logparser and statsd. + ## Valid values are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = true + ## Run telegraf in quiet mode (error log messages only). + quiet = false + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + + ## Override default hostname, if empty use os.Hostname() + # hostname = "" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = false + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Configuration for influxdb server to send metrics to +[[outputs.influxdb]] + ## The full HTTP or UDP endpoint URL for your InfluxDB instance. + ## Multiple urls can be specified as part of the same cluster, + ## this means that only ONE of the urls will be written to each interval.
+ # urls = ["udp://localhost:8089"] # UDP endpoint example + urls = ["http://localhost:8086"] # required + ## The target database for metrics (telegraf will create it if not exists). + database = "opennti_internal" # required + + ## Retention policy to write to. Empty string writes to the default rp. + retention_policy = "" + ## Write consistency (clusters only), can be: "any", "one", "quorum", "all" + write_consistency = "any" + + ## Write timeout (for the InfluxDB client), formatted as a string. + ## If not provided, will default to 5s. 0s means no timeout (not recommended). + timeout = "5s" + # username = "telegraf" + # password = "metricsmetricsmetricsmetrics" + + + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +# # Print all metrics that pass through this filter. +# [[processors.printer]] + + + +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### + +# # Keep the aggregate min/max of each metric passing through. +# [[aggregators.minmax]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +[[inputs.cpu]] + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## If true, collect raw CPU time metrics. 
+ collect_cpu_time = false + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default, telegraf gathers stats for all mountpoints. + ## Setting mountpoints will restrict the stats to the specified mountpoints. + # mount_points = ["/"] + + ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually + ## present on /run, /var/run, /dev/shm or /dev). + ignore_fs = ["tmpfs", "devtmpfs"] + +# Read metrics about disk IO by device +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + # devices = ["sda", "sdb"] + ## Uncomment the following line if you need disk serial numbers. + # skip_serial_number = false + + +# Get kernel statistics from /proc/stat +[[inputs.kernel]] + # no configuration + + +# Read metrics about memory usage +[[inputs.mem]] + # no configuration + + +# Read metrics about swap memory usage +[[inputs.swap]] + # no configuration + + +# Read metrics about system load & uptime +[[inputs.system]] + # no configuration + + +# # Gather health check statuses from services registered in Consul +# [[inputs.consul]] +# ## Most of these values defaults to the one configured on a Consul's agent level.
+# ## Optional Consul server address (default: "localhost") +# # address = "localhost" +# ## Optional URI scheme for the Consul server (default: "http") +# # scheme = "http" +# ## Optional ACL token used in every request (default: "") +# # token = "" +# ## Optional username used for request HTTP Basic Authentication (default: "") +# # username = "" +# ## Optional password used for HTTP Basic Authentication (default: "") +# # password = "" +# ## Optional data centre to query the health checks from (default: "") +# # datacentre = "" + + +# # Read metrics about docker containers +# [[inputs.docker]] +# ## Docker Endpoint +# ## To use TCP, set endpoint = "tcp://[ip]:[port]" +# ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# endpoint = "unix:///var/run/docker.sock" +# ## Only collect metrics for these containers, collect all if empty +# container_names = [] +# ## Timeout for docker list, info, and stats commands +# timeout = "5s" +# +# ## Whether to report for each container per-device blkio (8:0, 8:1...) and +# ## network (eth0, eth1, ...) stats or not +# perdevice = true +# ## Whether to report for each container total blkio and network stats or not +# total = false +# + + + + +# # Read stats about given file(s) +# [[inputs.filestat]] +# ## Files to gather stats about. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## "/var/log/**.log" -> recursively find all .log files in /var/log +# ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log +# ## "/var/log/apache.log" -> just tail the apache log file +# ## +# ## See https://github.com/gobwas/glob for more examples +# ## +# files = ["/var/log/**.log"] +# ## If true, read the entire file and calculate an md5 checksum. 
+# md5 = false + + + + +# # Read InfluxDB-formatted JSON metrics from one or more HTTP endpoints +# [[inputs.influxdb]] +# ## Works with InfluxDB debug endpoints out of the box, +# ## but other services can use this format too. +# ## See the influxdb plugin's README for more details. +# +# ## Multiple URLs from which to read InfluxDB-formatted JSON +# ## Default is "http://localhost:8086/debug/vars". +# urls = [ +# "http://localhost:8086/debug/vars" +# ] +# +# ## http request & header timeout +# timeout = "5s" + + +# # Gather packets and bytes throughput from iptables +# [[inputs.iptables]] +# ## iptables require root access on most systems. +# ## Setting 'use_sudo' to true will make use of sudo to run iptables. +# ## Users must configure sudo to allow telegraf user to run iptables with no password. +# ## iptables can be restricted to only list command "iptables -nvL" +# use_sudo = false +# ## defines the table to monitor: +# table = "filter" +# ## defines the chains to monitor: +# chains = [ "INPUT" ] + + +# # Get kernel statistics from /proc/vmstat +# [[inputs.kernel_vmstat]] +# # no configuration + + +# # Read metrics about network interface usage +[[inputs.net]] + ## By default, telegraf gathers stats from any up interface (excluding loopback) + ## Setting interfaces will tell it to gather these explicit interfaces, + ## regardless of status. + ## + # interfaces = ["eth0"] + + +# # TCP or UDP 'ping' given url and collect response time in seconds +# [[inputs.net_response]] +# ## Protocol, must be "tcp" or "udp" +# protocol = "tcp" +# ## Server address (default localhost) +# address = "github.com:80" +# ## Set timeout +# timeout = "1s" +# +# ## Optional string sent to the server +# # send = "ssh" +# ## Optional expected string in answer +# # expect = "ssh" +# ## Set read timeout (only used if expecting a response) +# read_timeout = "1s" + + +# # Read TCP metrics such as established, time wait and sockets counts. 
+# [[inputs.netstat]] +# # no configuration + + +# # Read Nginx's basic status information (ngx_http_stub_status_module) +# [[inputs.nginx]] +# ## An array of Nginx stub_status URI to gather stats. +# urls = ["http://localhost/status"] + + +# # Collect kernel snmp counters and network interface statistics +# [[inputs.nstat]] +# ## file paths for proc files. If empty default paths will be used: +# ## /proc/net/netstat, /proc/net/snmp, /proc/net/snmp6 +# ## These can also be overridden with env variables, see README. +# proc_net_netstat = "/proc/net/netstat" +# proc_net_snmp = "/proc/net/snmp" +# proc_net_snmp6 = "/proc/net/snmp6" +# ## dump metrics with 0 values too +# dump_zeros = true + + + +# # Ping given url(s) and return statistics +# [[inputs.ping]] +# ## NOTE: this plugin forks the ping command. You may need to set capabilities +# ## via setcap cap_net_raw+p /bin/ping +# # +# ## urls to ping +# urls = ["www.google.com"] # required +# ## number of pings to send per collection (ping -c ) +# # count = 1 +# ## interval, in s, at which to ping. 0 == default (ping -i ) +# # ping_interval = 1.0 +# ## per-ping timeout, in s. 
0 == no timeout (ping -W ) +# # timeout = 1.0 +# ## interface to send ping from (ping -I ) +# # interface = "" + + +# # Monitor process cpu and memory usage +# [[inputs.procstat]] +# ## Must specify one of: pid_file, exe, or pattern +# ## PID file to monitor process +# pid_file = "/var/run/nginx.pid" +# ## executable name (ie, pgrep ) +# # exe = "nginx" +# ## pattern as argument for pgrep (ie, pgrep -f ) +# # pattern = "nginx" +# ## user as argument for pgrep (ie, pgrep -u ) +# # user = "nginx" +# +# ## override for process_name +# ## This is optional; default is sourced from /proc//status +# # process_name = "bar" +# ## Field name prefix +# prefix = "" +# ## comment this out if you want raw cpu_time stats +# fielddrop = ["cpu_time_*"] diff --git a/plugins/input-jti/Dockerfile b/plugins/input-jti/Dockerfile index 23e1317..4bd6952 100644 --- a/plugins/input-jti/Dockerfile +++ b/plugins/input-jti/Dockerfile @@ -2,6 +2,7 @@ FROM fluent/fluentd:v0.12.29 MAINTAINER Damien Garros ENV FLUENTD_JUNIPER_VERSION 0.3.0 +ARG TELEGRAF_VERSION=1.1.2 USER root WORKDIR /home/fluent @@ -45,6 +46,26 @@ COPY plugins /fluentd/plugins RUN echo "fluent ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers +############################# +## Install Telegraf +############################# +RUN apk add --no-cache ca-certificates openssl wget && \ + update-ca-certificates + +RUN wget -q https://dl.influxdata.com/telegraf/releases/telegraf-${TELEGRAF_VERSION}-static_linux_amd64.tar.gz && \ + mkdir -p /usr/src /etc/telegraf && \ + tar -C /usr/src -xzf telegraf-${TELEGRAF_VERSION}-static_linux_amd64.tar.gz && \ + mv /usr/src/telegraf*/telegraf.conf /etc/telegraf/ && \ + chmod +x /usr/src/telegraf*/* && \ + cp -a /usr/src/telegraf*/* /usr/bin/ && \ + rm -rf *.tar.gz* /usr/src /root/.gnupg + +COPY telegraf.toml /home/fluent/telegraf.toml +RUN touch /var/log/telegraf-monitoring.log &&\ + chmod 777 /var/log/telegraf-monitoring.log + + + USER fluent EXPOSE 24284 diff --git 
a/plugins/input-jti/fluentd-alpine.start.sh b/plugins/input-jti/fluentd-alpine.start.sh index 32ab016..3c9e8ce 100644 --- a/plugins/input-jti/fluentd-alpine.start.sh +++ b/plugins/input-jti/fluentd-alpine.start.sh @@ -2,6 +2,8 @@ # `/sbin/setuser memcache` runs the given command as the user `memcache`. # If you omit that part, the command will be run as root. +telegraf --config /home/fluent/telegraf.toml >>/var/log/telegraf-monitoring.log 2>&1 & + envtpl --keep-template /fluentd/etc/fluent.conf -o /tmp/fluent.conf fluentd -c /tmp/fluent.conf -p /fluentd/plugins $FLUENTD_OPT diff --git a/plugins/input-jti/telegraf.toml b/plugins/input-jti/telegraf.toml new file mode 100644 index 0000000..60ea5ab --- /dev/null +++ b/plugins/input-jti/telegraf.toml @@ -0,0 +1,339 @@ + +# Global tags can be specified here in key="value" format. +[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + role = "input-jti" + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "10s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. 
+ ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default, precision will be set to the same timestamp order as the + ## collection interval, with the maximum being 1s. + ## Precision will NOT be used for service inputs, such as logparser and statsd. + ## Valid values are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = true + ## Run telegraf in quiet mode (error log messages only). + quiet = false + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + + ## Override default hostname, if empty use os.Hostname() + # hostname = "" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = false + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Configuration for influxdb server to send metrics to +[[outputs.influxdb]] + ## The full HTTP or UDP endpoint URL for your InfluxDB instance. + ## Multiple urls can be specified as part of the same cluster, + ## this means that only ONE of the urls will be written to each interval. + # urls = ["udp://localhost:8089"] # UDP endpoint example + urls = ["http://opennti:8086"] # required + ## The target database for metrics (telegraf will create it if not exists).
+ database = "opennti_internal" # required + + ## Retention policy to write to. Empty string writes to the default rp. + retention_policy = "" + ## Write consistency (clusters only), can be: "any", "one", "quorum", "all" + write_consistency = "any" + + ## Write timeout (for the InfluxDB client), formatted as a string. + ## If not provided, will default to 5s. 0s means no timeout (not recommended). + timeout = "5s" + # username = "telegraf" + # password = "metricsmetricsmetricsmetrics" + + + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +# # Print all metrics that pass through this filter. +# [[processors.printer]] + + + +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### + +# # Keep the aggregate min/max of each metric passing through. +# [[aggregators.minmax]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +[[inputs.cpu]] + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## If true, collect raw CPU time metrics. + collect_cpu_time = false + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default, telegraf gather stats for all mountpoints. + ## Setting mountpoints will restrict the stats to the specified mountpoints. 
+ # mount_points = ["/"] + + ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually + ## present on /run, /var/run, /dev/shm or /dev). + ignore_fs = ["tmpfs", "devtmpfs"] + +# Read metrics about disk IO by device +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + # devices = ["sda", "sdb"] + ## Uncomment the following line if you need disk serial numbers. + # skip_serial_number = false + + +# Get kernel statistics from /proc/stat +[[inputs.kernel]] + # no configuration + + +# Read metrics about memory usage +[[inputs.mem]] + # no configuration + + +# Read metrics about swap memory usage +[[inputs.swap]] + # no configuration + + +# Read metrics about system load & uptime +[[inputs.system]] + # no configuration + + +# # Gather health check statuses from services registered in Consul +# [[inputs.consul]] +# ## Most of these values defaults to the one configured on a Consul's agent level. 
+# ## Optional Consul server address (default: "localhost") +# # address = "localhost" +# ## Optional URI scheme for the Consul server (default: "http") +# # scheme = "http" +# ## Optional ACL token used in every request (default: "") +# # token = "" +# ## Optional username used for request HTTP Basic Authentication (default: "") +# # username = "" +# ## Optional password used for HTTP Basic Authentication (default: "") +# # password = "" +# ## Optional data centre to query the health checks from (default: "") +# # datacentre = "" + + +# # Read metrics about docker containers +# [[inputs.docker]] +# ## Docker Endpoint +# ## To use TCP, set endpoint = "tcp://[ip]:[port]" +# ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# endpoint = "unix:///var/run/docker.sock" +# ## Only collect metrics for these containers, collect all if empty +# container_names = [] +# ## Timeout for docker list, info, and stats commands +# timeout = "5s" +# +# ## Whether to report for each container per-device blkio (8:0, 8:1...) and +# ## network (eth0, eth1, ...) stats or not +# perdevice = true +# ## Whether to report for each container total blkio and network stats or not +# total = false +# + + + + +# # Read stats about given file(s) +# [[inputs.filestat]] +# ## Files to gather stats about. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## "/var/log/**.log" -> recursively find all .log files in /var/log +# ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log +# ## "/var/log/apache.log" -> just tail the apache log file +# ## +# ## See https://github.com/gobwas/glob for more examples +# ## +# files = ["/var/log/**.log"] +# ## If true, read the entire file and calculate an md5 checksum. 
+# md5 = false + + + + +# # Read InfluxDB-formatted JSON metrics from one or more HTTP endpoints +# [[inputs.influxdb]] +# ## Works with InfluxDB debug endpoints out of the box, +# ## but other services can use this format too. +# ## See the influxdb plugin's README for more details. +# +# ## Multiple URLs from which to read InfluxDB-formatted JSON +# ## Default is "http://localhost:8086/debug/vars". +# urls = [ +# "http://localhost:8086/debug/vars" +# ] +# +# ## http request & header timeout +# timeout = "5s" + + +# # Gather packets and bytes throughput from iptables +# [[inputs.iptables]] +# ## iptables require root access on most systems. +# ## Setting 'use_sudo' to true will make use of sudo to run iptables. +# ## Users must configure sudo to allow telegraf user to run iptables with no password. +# ## iptables can be restricted to only list command "iptables -nvL" +# use_sudo = false +# ## defines the table to monitor: +# table = "filter" +# ## defines the chains to monitor: +# chains = [ "INPUT" ] + + +# # Get kernel statistics from /proc/vmstat +# [[inputs.kernel_vmstat]] +# # no configuration + + +# # Read metrics about network interface usage +[[inputs.net]] + ## By default, telegraf gathers stats from any up interface (excluding loopback) + ## Setting interfaces will tell it to gather these explicit interfaces, + ## regardless of status. + ## + interfaces = ["eth0"] + + +# # TCP or UDP 'ping' given url and collect response time in seconds +# [[inputs.net_response]] +# ## Protocol, must be "tcp" or "udp" +# protocol = "tcp" +# ## Server address (default localhost) +# address = "github.com:80" +# ## Set timeout +# timeout = "1s" +# +# ## Optional string sent to the server +# # send = "ssh" +# ## Optional expected string in answer +# # expect = "ssh" +# ## Set read timeout (only used if expecting a response) +# read_timeout = "1s" + + +# # Read TCP metrics such as established, time wait and sockets counts. 
+# [[inputs.netstat]] +# # no configuration + + +# # Read Nginx's basic status information (ngx_http_stub_status_module) +# [[inputs.nginx]] +# ## An array of Nginx stub_status URI to gather stats. +# urls = ["http://localhost/status"] + + +# # Collect kernel snmp counters and network interface statistics +# [[inputs.nstat]] +# ## file paths for proc files. If empty default paths will be used: +# ## /proc/net/netstat, /proc/net/snmp, /proc/net/snmp6 +# ## These can also be overridden with env variables, see README. +# proc_net_netstat = "/proc/net/netstat" +# proc_net_snmp = "/proc/net/snmp" +# proc_net_snmp6 = "/proc/net/snmp6" +# ## dump metrics with 0 values too +# dump_zeros = true + + + +# # Ping given url(s) and return statistics +# [[inputs.ping]] +# ## NOTE: this plugin forks the ping command. You may need to set capabilities +# ## via setcap cap_net_raw+p /bin/ping +# # +# ## urls to ping +# urls = ["www.google.com"] # required +# ## number of pings to send per collection (ping -c ) +# # count = 1 +# ## interval, in s, at which to ping. 0 == default (ping -i ) +# # ping_interval = 1.0 +# ## per-ping timeout, in s. 
0 == no timeout (ping -W ) +# # timeout = 1.0 +# ## interface to send ping from (ping -I ) +# # interface = "" + + +# # Monitor process cpu and memory usage +# [[inputs.procstat]] +# ## Must specify one of: pid_file, exe, or pattern +# ## PID file to monitor process +# pid_file = "/var/run/nginx.pid" +# ## executable name (ie, pgrep ) +# # exe = "nginx" +# ## pattern as argument for pgrep (ie, pgrep -f ) +# # pattern = "nginx" +# ## user as argument for pgrep (ie, pgrep -u ) +# # user = "nginx" +# +# ## override for process_name +# ## This is optional; default is sourced from /proc//status +# # process_name = "bar" +# ## Field name prefix +# prefix = "" +# ## comment this out if you want raw cpu_time stats +# fielddrop = ["cpu_time_*"] diff --git a/plugins/input-syslog/Dockerfile b/plugins/input-syslog/Dockerfile index 086ef12..2551296 100644 --- a/plugins/input-syslog/Dockerfile +++ b/plugins/input-syslog/Dockerfile @@ -2,6 +2,7 @@ FROM fluent/fluentd:v0.12.24 MAINTAINER Damien Garros ENV FLUENTD_JUNIPER_VERSION 0.2.11 +ARG TELEGRAF_VERSION=1.1.2 USER root WORKDIR /home/fluent @@ -28,6 +29,25 @@ RUN apk --no-cache --update add \ apk del build-base ruby-dev && \ rm -rf /tmp/* /var/tmp/* /var/cache/apk/* + +############################# +## Install Telegraf +############################# +RUN apk add --no-cache ca-certificates openssl wget && \ + update-ca-certificates + +RUN wget -q https://dl.influxdata.com/telegraf/releases/telegraf-${TELEGRAF_VERSION}-static_linux_amd64.tar.gz && \ + mkdir -p /usr/src /etc/telegraf && \ + tar -C /usr/src -xzf telegraf-${TELEGRAF_VERSION}-static_linux_amd64.tar.gz && \ + mv /usr/src/telegraf*/telegraf.conf /etc/telegraf/ && \ + chmod +x /usr/src/telegraf*/* && \ + cp -a /usr/src/telegraf*/* /usr/bin/ && \ + rm -rf *.tar.gz* /usr/src /root/.gnupg + +COPY telegraf.toml /home/fluent/telegraf.toml +RUN touch /var/log/telegraf-monitoring.log &&\ + chmod 777 /var/log/telegraf-monitoring.log + # Copy Start script to generate 
configuration dynamically ADD fluentd-alpine.start.sh fluentd-alpine.start.sh RUN chown -R fluent:fluent fluentd-alpine.start.sh diff --git a/plugins/input-syslog/fluentd-alpine.start.sh b/plugins/input-syslog/fluentd-alpine.start.sh index 32ab016..3c9e8ce 100644 --- a/plugins/input-syslog/fluentd-alpine.start.sh +++ b/plugins/input-syslog/fluentd-alpine.start.sh @@ -2,6 +2,8 @@ # `/sbin/setuser memcache` runs the given command as the user `memcache`. # If you omit that part, the command will be run as root. +telegraf --config /home/fluent/telegraf.toml >>/var/log/telegraf-monitoring.log 2>&1 & + envtpl --keep-template /fluentd/etc/fluent.conf -o /tmp/fluent.conf fluentd -c /tmp/fluent.conf -p /fluentd/plugins $FLUENTD_OPT diff --git a/plugins/input-syslog/telegraf.toml b/plugins/input-syslog/telegraf.toml new file mode 100644 index 0000000..fd75300 --- /dev/null +++ b/plugins/input-syslog/telegraf.toml @@ -0,0 +1,339 @@ + +# Global tags can be specified here in key="value" format. +[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + role = "input-syslog" + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "10s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + ## This buffer only fills when writes fail to output plugin(s). + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. 
+ ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default, precision will be set to the same timestamp order as the + ## collection interval, with the maximum being 1s. + ## Precision will NOT be used for service inputs, such as logparser and statsd. + ## Valid values are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Logging configuration: + ## Run telegraf with debug log messages. + debug = true + ## Run telegraf in quiet mode (error log messages only). + quiet = false + ## Specify the log file name. The empty string means to log to stderr. + logfile = "" + + ## Override default hostname, if empty use os.Hostname() + # hostname = "" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = false + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Configuration for influxdb server to send metrics to +[[outputs.influxdb]] + ## The full HTTP or UDP endpoint URL for your InfluxDB instance. + ## Multiple urls can be specified as part of the same cluster, + ## this means that only ONE of the urls will be written to each interval.
+ # urls = ["udp://localhost:8089"] # UDP endpoint example + urls = ["http://opennti:8086"] # required + ## The target database for metrics (telegraf will create it if not exists). + database = "opennti_internal" # required + + ## Retention policy to write to. Empty string writes to the default rp. + retention_policy = "" + ## Write consistency (clusters only), can be: "any", "one", "quorum", "all" + write_consistency = "any" + + ## Write timeout (for the InfluxDB client), formatted as a string. + ## If not provided, will default to 5s. 0s means no timeout (not recommended). + timeout = "5s" + # username = "telegraf" + # password = "metricsmetricsmetricsmetrics" + + + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + +# # Print all metrics that pass through this filter. +# [[processors.printer]] + + + +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### + +# # Keep the aggregate min/max of each metric passing through. +# [[aggregators.minmax]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +[[inputs.cpu]] + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## If true, collect raw CPU time metrics.
+ collect_cpu_time = false + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default, telegraf gather stats for all mountpoints. + ## Setting mountpoints will restrict the stats to the specified mountpoints. + # mount_points = ["/"] + + ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually + ## present on /run, /var/run, /dev/shm or /dev). + ignore_fs = ["tmpfs", "devtmpfs"] + +# Read metrics about disk IO by device +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + # devices = ["sda", "sdb"] + ## Uncomment the following line if you need disk serial numbers. + # skip_serial_number = false + + +# Get kernel statistics from /proc/stat +[[inputs.kernel]] + # no configuration + + +# Read metrics about memory usage +[[inputs.mem]] + # no configuration + + +# Read metrics about swap memory usage +[[inputs.swap]] + # no configuration + + +# Read metrics about system load & uptime +[[inputs.system]] + # no configuration + + +# # Gather health check statuses from services registered in Consul +# [[inputs.consul]] +# ## Most of these values defaults to the one configured on a Consul's agent level. 
+# ## Optional Consul server address (default: "localhost") +# # address = "localhost" +# ## Optional URI scheme for the Consul server (default: "http") +# # scheme = "http" +# ## Optional ACL token used in every request (default: "") +# # token = "" +# ## Optional username used for request HTTP Basic Authentication (default: "") +# # username = "" +# ## Optional password used for HTTP Basic Authentication (default: "") +# # password = "" +# ## Optional data centre to query the health checks from (default: "") +# # datacentre = "" + + +# # Read metrics about docker containers +# [[inputs.docker]] +# ## Docker Endpoint +# ## To use TCP, set endpoint = "tcp://[ip]:[port]" +# ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# endpoint = "unix:///var/run/docker.sock" +# ## Only collect metrics for these containers, collect all if empty +# container_names = [] +# ## Timeout for docker list, info, and stats commands +# timeout = "5s" +# +# ## Whether to report for each container per-device blkio (8:0, 8:1...) and +# ## network (eth0, eth1, ...) stats or not +# perdevice = true +# ## Whether to report for each container total blkio and network stats or not +# total = false +# + + + + +# # Read stats about given file(s) +# [[inputs.filestat]] +# ## Files to gather stats about. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## "/var/log/**.log" -> recursively find all .log files in /var/log +# ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log +# ## "/var/log/apache.log" -> just tail the apache log file +# ## +# ## See https://github.com/gobwas/glob for more examples +# ## +# files = ["/var/log/**.log"] +# ## If true, read the entire file and calculate an md5 checksum. 
+# md5 = false + + + + +# # Read InfluxDB-formatted JSON metrics from one or more HTTP endpoints +# [[inputs.influxdb]] +# ## Works with InfluxDB debug endpoints out of the box, +# ## but other services can use this format too. +# ## See the influxdb plugin's README for more details. +# +# ## Multiple URLs from which to read InfluxDB-formatted JSON +# ## Default is "http://localhost:8086/debug/vars". +# urls = [ +# "http://localhost:8086/debug/vars" +# ] +# +# ## http request & header timeout +# timeout = "5s" + + +# # Gather packets and bytes throughput from iptables +# [[inputs.iptables]] +# ## iptables require root access on most systems. +# ## Setting 'use_sudo' to true will make use of sudo to run iptables. +# ## Users must configure sudo to allow telegraf user to run iptables with no password. +# ## iptables can be restricted to only list command "iptables -nvL" +# use_sudo = false +# ## defines the table to monitor: +# table = "filter" +# ## defines the chains to monitor: +# chains = [ "INPUT" ] + + +# # Get kernel statistics from /proc/vmstat +# [[inputs.kernel_vmstat]] +# # no configuration + + +# # Read metrics about network interface usage +[[inputs.net]] + ## By default, telegraf gathers stats from any up interface (excluding loopback) + ## Setting interfaces will tell it to gather these explicit interfaces, + ## regardless of status. + ## + interfaces = ["eth0"] + + +# # TCP or UDP 'ping' given url and collect response time in seconds +# [[inputs.net_response]] +# ## Protocol, must be "tcp" or "udp" +# protocol = "tcp" +# ## Server address (default localhost) +# address = "github.com:80" +# ## Set timeout +# timeout = "1s" +# +# ## Optional string sent to the server +# # send = "ssh" +# ## Optional expected string in answer +# # expect = "ssh" +# ## Set read timeout (only used if expecting a response) +# read_timeout = "1s" + + +# # Read TCP metrics such as established, time wait and sockets counts. 
+# [[inputs.netstat]] +# # no configuration + + +# # Read Nginx's basic status information (ngx_http_stub_status_module) +# [[inputs.nginx]] +# ## An array of Nginx stub_status URI to gather stats. +# urls = ["http://localhost/status"] + + +# # Collect kernel snmp counters and network interface statistics +# [[inputs.nstat]] +# ## file paths for proc files. If empty default paths will be used: +# ## /proc/net/netstat, /proc/net/snmp, /proc/net/snmp6 +# ## These can also be overridden with env variables, see README. +# proc_net_netstat = "/proc/net/netstat" +# proc_net_snmp = "/proc/net/snmp" +# proc_net_snmp6 = "/proc/net/snmp6" +# ## dump metrics with 0 values too +# dump_zeros = true + + + +# # Ping given url(s) and return statistics +# [[inputs.ping]] +# ## NOTE: this plugin forks the ping command. You may need to set capabilities +# ## via setcap cap_net_raw+p /bin/ping +# # +# ## urls to ping +# urls = ["www.google.com"] # required +# ## number of pings to send per collection (ping -c ) +# # count = 1 +# ## interval, in s, at which to ping. 0 == default (ping -i ) +# # ping_interval = 1.0 +# ## per-ping timeout, in s. 0 == no timeout (ping -W ) +# # timeout = 1.0 +# ## interface to send ping from (ping -I ) +# # interface = "" + + +# # Monitor process cpu and memory usage +# [[inputs.procstat]] +# ## Must specify one of: pid_file, exe, or pattern +# ## PID file to monitor process +# pid_file = "/var/run/nginx.pid" +# ## executable name (ie, pgrep ) +# # exe = "nginx" +# ## pattern as argument for pgrep (ie, pgrep -f ) +# # pattern = "nginx" +# ## user as argument for pgrep (ie, pgrep -u ) +# # user = "nginx" +# +# ## override for process_name +# ## This is optional; default is sourced from /proc//status +# # process_name = "bar" +# ## Field name prefix +# prefix = "" +# ## comment this out if you want raw cpu_time stats +# fielddrop = ["cpu_time_*"]