add basic infrastructure for writing a checker
These are the first components to interact with the database in a predefined way to write a checker instance. CheckExec serves as an example implementing the Executor interface.
This commit is contained in:
parent
e2b479c34f
commit
0639a504eb
|
@ -0,0 +1,38 @@
|
||||||
|
package monzero
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"syscall"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CheckExec runs a command line string.
|
||||||
|
// The output is recorded completely and returned as one message.
|
||||||
|
func CheckExec(check Check, ctx context.Context) CheckResult {
|
||||||
|
result := CheckResult{}
|
||||||
|
|
||||||
|
cmd := exec.CommandContext(ctx, check.Command[0], check.Command[1:]...)
|
||||||
|
output := bytes.NewBuffer([]byte{})
|
||||||
|
cmd.Stdout = output
|
||||||
|
cmd.Stderr = output
|
||||||
|
err := cmd.Run()
|
||||||
|
if err != nil {
|
||||||
|
if cmd.ProcessState == nil {
|
||||||
|
result.Message = fmt.Sprintf("unknown error when running command: %w", err)
|
||||||
|
result.ExitCode = 3
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
status, ok := cmd.ProcessState.Sys().(syscall.WaitStatus)
|
||||||
|
if !ok {
|
||||||
|
result.Message = fmt.Sprintf("error running check: %w", err)
|
||||||
|
result.ExitCode = 2
|
||||||
|
} else {
|
||||||
|
result.ExitCode = status.ExitStatus()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.Message = output.String()
|
||||||
|
return result
|
||||||
|
}
|
|
@ -0,0 +1,150 @@
|
||||||
|
package monzero
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
ErrNoCheck = fmt.Errorf("no check found to run")
|
||||||
|
)
|
||||||
|
|
||||||
|
type (
|
||||||
|
// Checker maintains the state of checks that need to be run.
|
||||||
|
Checker struct {
|
||||||
|
db *sql.DB
|
||||||
|
id int // id is the resolved checker id for this instance.
|
||||||
|
executor func(Check, context.Context) CheckResult
|
||||||
|
timeout time.Duration
|
||||||
|
ident string // the host identifier
|
||||||
|
}
|
||||||
|
|
||||||
|
CheckerConfig struct {
|
||||||
|
// CheckerID is used to find the checks that need to be run by this
|
||||||
|
// instance.
|
||||||
|
CheckerID int
|
||||||
|
|
||||||
|
// DB is the connection to the database to use.
|
||||||
|
DB *sql.DB
|
||||||
|
|
||||||
|
// Timeout is the duration a check has time to run.
|
||||||
|
// Set this to a reasonable value for all checks to avoid long running
|
||||||
|
// checks blocking the execution.
|
||||||
|
Timeout time.Duration
|
||||||
|
|
||||||
|
// Executor receives a check and must run the requested command in the
|
||||||
|
// time of the context.
|
||||||
|
// At the end it must return a CheckResult.
|
||||||
|
Executor func(Check, context.Context) CheckResult
|
||||||
|
|
||||||
|
// HostIdentifier is used in notifications to point to the source of the
|
||||||
|
// notification.
|
||||||
|
HostIdentifier string
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check is contains the metadata to run a check and its current state.
|
||||||
|
Check struct {
|
||||||
|
// Command is the command to run as stored in the database.
|
||||||
|
Command []string
|
||||||
|
// ExitCodes contains the list of exit codes of past runs.
|
||||||
|
ExitCodes []int
|
||||||
|
|
||||||
|
id int64 // the check instance id
|
||||||
|
mappingId int // ID to map the result for this check
|
||||||
|
}
|
||||||
|
|
||||||
|
// CheckResult is the result of a check. It may contain a message
|
||||||
|
// and must contain an exit code.
|
||||||
|
// The exit code should conform to the nagios specification of
|
||||||
|
// 0 - okay
|
||||||
|
// 1 - error
|
||||||
|
// 2 - warning
|
||||||
|
// 3 - unknown or executor errors
|
||||||
|
// Other codes are also okay and may be mapped to different values, but
|
||||||
|
// need further configuration in the system.
|
||||||
|
CheckResult struct {
|
||||||
|
ExitCode int
|
||||||
|
Message string // Message will be shown in the frontend for context
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
func NewChecker(cfg CheckerConfig) (*Checker, error) {
|
||||||
|
c := &Checker{db: cfg.DB,
|
||||||
|
executor: cfg.Executor,
|
||||||
|
timeout: cfg.Timeout,
|
||||||
|
ident: cfg.HostIdentifier,
|
||||||
|
}
|
||||||
|
if c.executor == nil {
|
||||||
|
return nil, fmt.Errorf("executor must not be nil")
|
||||||
|
}
|
||||||
|
|
||||||
|
return c, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Next pulls the next check in line and runs the set executor.
|
||||||
|
// The result is then updated in the database and a notification generated.
|
||||||
|
func (c *Checker) Next() error {
|
||||||
|
check := Check{}
|
||||||
|
tx, err := c.db.Begin()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("could not start database transaction: %w", err)
|
||||||
|
}
|
||||||
|
defer tx.Rollback()
|
||||||
|
err = tx.
|
||||||
|
QueryRow(`select check_id, cmdLine, states, mapping_id
|
||||||
|
from active_checks
|
||||||
|
where next_time < now()
|
||||||
|
and enabled
|
||||||
|
and checker_id = $1
|
||||||
|
order by next_time
|
||||||
|
for update skip locked
|
||||||
|
limit 1;`, c.id).
|
||||||
|
Scan(&check.id, &check.Command, &check.ExitCodes, &check.mappingId)
|
||||||
|
if err != nil {
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
return ErrNoCheck
|
||||||
|
}
|
||||||
|
return fmt.Errorf("could not get next check: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
|
||||||
|
defer cancel()
|
||||||
|
result := c.executor(check, ctx)
|
||||||
|
if ctx.Err() == context.DeadlineExceeded {
|
||||||
|
result.Message = fmt.Sprintf("check took longer than %s", c.timeout)
|
||||||
|
result.ExitCode = 2
|
||||||
|
}
|
||||||
|
|
||||||
|
backToOkay := false
|
||||||
|
if len(check.ExitCodes) == 0 && result.ExitCode == 0 {
|
||||||
|
backToOkay = true
|
||||||
|
} else if len(check.ExitCodes) > 0 && check.ExitCodes[0] > 0 && result.ExitCode == 0 {
|
||||||
|
backToOkay = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := tx.Exec(`update active_checks ac
|
||||||
|
set next_time = now() + intval, states = ARRAY[$2::int] || states[1:4],
|
||||||
|
msg = $3,
|
||||||
|
acknowledged = case when $4 then false else acknowledged end,
|
||||||
|
state_since = case $2 when states[1] then state_since else now() end
|
||||||
|
where check_id = $1`, check.id, result.ExitCode, result.Message, backToOkay); err != nil {
|
||||||
|
return fmt.Errorf("could not update check '%d': %w", check.id, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := tx.Exec(`insert into notifications(check_id, states, output, mapping_id, notifier_id, check_host)
|
||||||
|
select $1, array_agg(ml.target), $2, $3, cn.notifier_id, $4
|
||||||
|
from active_checks ac
|
||||||
|
cross join lateral unnest(ac.states) s
|
||||||
|
join checks_notify cn on ac.check_id = cn.check_id
|
||||||
|
join mapping_level ml on ac.mapping_id = ml.mapping_id and s.s = ml.source
|
||||||
|
where ac.check_id = $1
|
||||||
|
and ac.acknowledged = false
|
||||||
|
and cn.enabled = true
|
||||||
|
group by cn.notifier_id;`, check.id, result.Message, check.mappingId, c.ident); err != nil {
|
||||||
|
return fmt.Errorf("could not create notification '%d': %s", check.id, err)
|
||||||
|
}
|
||||||
|
tx.Commit()
|
||||||
|
return nil
|
||||||
|
}
|
Loading…
Reference in New Issue