Telegraf is a plugin-driven server agent for collecting and reporting metrics. 在看了Telegraf的介绍之后,发现它能很方便地把各种性能指标收集起来,再写入到各类数据库、消息队列等(当然我肯定首选influxdb)。再看了一下已实现的input plugin有几十个,已经覆盖了常用的各类服务端软件(包括公司使用到的haproxy、varnish、mongodb、redis、docker),而且还能做DNS和Ping检测,对于监控的数据收集来说已经绰绰有余了,而且其配置简单,下面是机器性能与mongodb的性能收集配置:
[[inputs.cpu]]
## Host CPU metrics collection settings
## Whether to report metrics for each individual CPU
percpu = true
## Whether to report CPU metrics for the system as a whole
totalcpu = true
## Whether to collect the raw CPU time counters (usually not needed)
collect_cpu_time = false
## The inputs below all come from Telegraf's bundled configuration, so they are not explained in detail here
# Read metrics about disk usage by mount point
[[inputs.disk]]
...
# Read metrics about disk IO by device
[[inputs.diskio]]
...
# Get kernel statistics from /proc/stat
[[inputs.kernel]]
...
# Read metrics about memory usage
[[inputs.mem]]
...
# Get the number of processes and group them by status
[[inputs.processes]]
...
# Read metrics about swap memory usage
[[inputs.swap]]
...
# Read metrics about system load & uptime
[[inputs.system]]
...
# Read metrics from one or many MongoDB servers
[[inputs.mongodb]]
# One connection URI per server, in the form
#   mongodb://[username:password@]host[:port][/database][?options]
# The "mongodb://" scheme must appear exactly once, at the start of each URI.
servers = ["mongodb://username:password@10.XX.XX.XX:27101/mydatabase?authSource=admin"]
# Whether to also collect performance statistics for each database
# NOTE(review): the option line this comment describes is missing from this
# snippet — presumably Telegraf's `gather_perdb_stats`; confirm and restore it.
// Stage 1: read 'cpu' points from the stream and derive a 'used' field
// as the complement of the "usage_idle" field.
var used_cpu = stream
    |from()
        .measurement('cpu')
    |eval(lambda: 100.0 - "usage_idle")
        .as('used')

// Stage 2: per (service, datacenter) group, take the 95th percentile of
// 'used' over a 1m window that is emitted every 1m.
var p95_cpu = used_cpu
    |groupBy('service', 'datacenter')
    |window()
        .period(1m)
        .every(1m)
    |percentile('used', 95.0)

// Stage 3: score each percentile value with sigma() and keep both the
// percentile and its score on the point.
var scored_cpu = p95_cpu
    |eval(lambda: sigma("percentile"))
        .as('sigma')
        .keep('percentile', 'sigma')

// Stage 4: raise alerts from the score and fan them out to every handler.
scored_cpu
    |alert()
        .id('{{ .Name }}/{{ index .Tags "service" }}/{{ index .Tags "datacenter"}}')
        .message('{{ .ID }} is {{ .Level }} cpu-95th:{{ index .Fields "percentile" }}')
        // Thresholds are compared against the running mean and standard deviation.
        .warn(lambda: "sigma" > 2.5)
        .crit(lambda: "sigma" > 3.0)
        // Append alerts to a local log file.
        .log('/tmp/alerts.log')
        // POST alert data to a custom endpoint.
        .post('https://alerthandler.example.com')
        // Run a custom alert handler script.
        .exec('/bin/custom_alert_handler.sh')
        // Notify Slack.
        .slack()
        .channel('#alerts')
        // Notify PagerDuty.
        .pagerDuty()
        // Notify VictorOps.
        .victorOps()
# Layout: connection settings and `measurement` are nested under `myinfluxdb`;
# each key under `measurement` (here `login`) holds a list of check rules.
myinfluxdb:
  # the influxdb host
  host: "127.0.0.1"
  # the influxdb port, default is 8086, [optional]
  port: 8086
  # the influxdb protocol, default is "http", [optional]
  protocol: http
  # the user for influxdb, [optional]
  # user: user
  # the password for influxdb, [optional]
  # password: password
  measurement:
    login:
      -
        # pass the check
        # [optional]
        pass: false
        # day filter, Monday:1, ... Sunday:7
        # [optional]
        # eg: "1-7" means Monday to Sunday
        # eg: ["1-3", "6-7"] means Monday to Wednesday
        # and Saturday to Sunday
        day: "1-7"
        # time filter
        # [optional]
        # eg: "00:00-12:00", or ["00:00-09:00", "13:00-18:00"]
        time: "00:00-24:00"
        # the warn text used when the check fails
        text: The count of successful login is abnormal
        # the influxdb where conditions
        # [optional]
        where:
          - result = success
        # the start time of influxdb query
        # [optional]
        start: "-5m"
        # the end time of influxdb query, default is now()
        # [optional]
        # end: "now()"
        # the influxdb function for data
        # [optional]
        func:
          - count(account)
        # check for each series of the result,
        # if the check returns true,
        # the warn event will be emitted
        check: count < 100
      -
        day: ["1", "2", "3", "4", "5", "6", "7"]
        time: ["00:00-12:00", "12:00-24:00"]
        text: The count of failed login is abnormal
        where:
          - result = fail
        func:
          - count(account)
        check: count > 10
      -
        text: The count of failed login(group by account's type) is abnormal
        group: type
        where:
          - result = fail
        func:
          - count(account)
        check:
          - count > 1 && type === 'vip'
          - count > 1 && type === 'normal'
      -
        day: "1-2"
        text: The check is pass
        pass: true
        check: type === 'test'