add basic infrastructure for writing a checker
These are the first components to interact with the database in a predefined way to write a checker instance. CheckExec serves as an example implementing the Executor interface.
This commit is contained in:
parent
e2b479c34f
commit
0639a504eb
|
@ -0,0 +1,38 @@
|
|||
package monzero
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// CheckExec runs a command line string.
|
||||
// The output is recorded completely and returned as one message.
|
||||
func CheckExec(check Check, ctx context.Context) CheckResult {
|
||||
result := CheckResult{}
|
||||
|
||||
cmd := exec.CommandContext(ctx, check.Command[0], check.Command[1:]...)
|
||||
output := bytes.NewBuffer([]byte{})
|
||||
cmd.Stdout = output
|
||||
cmd.Stderr = output
|
||||
err := cmd.Run()
|
||||
if err != nil {
|
||||
if cmd.ProcessState == nil {
|
||||
result.Message = fmt.Sprintf("unknown error when running command: %w", err)
|
||||
result.ExitCode = 3
|
||||
return result
|
||||
}
|
||||
|
||||
status, ok := cmd.ProcessState.Sys().(syscall.WaitStatus)
|
||||
if !ok {
|
||||
result.Message = fmt.Sprintf("error running check: %w", err)
|
||||
result.ExitCode = 2
|
||||
} else {
|
||||
result.ExitCode = status.ExitStatus()
|
||||
}
|
||||
}
|
||||
result.Message = output.String()
|
||||
return result
|
||||
}
|
|
@ -0,0 +1,150 @@
|
|||
package monzero
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
var (
|
||||
ErrNoCheck = fmt.Errorf("no check found to run")
|
||||
)
|
||||
|
||||
type (
|
||||
// Checker maintains the state of checks that need to be run.
|
||||
Checker struct {
|
||||
db *sql.DB
|
||||
id int // id is the resolved checker id for this instance.
|
||||
executor func(Check, context.Context) CheckResult
|
||||
timeout time.Duration
|
||||
ident string // the host identifier
|
||||
}
|
||||
|
||||
CheckerConfig struct {
|
||||
// CheckerID is used to find the checks that need to be run by this
|
||||
// instance.
|
||||
CheckerID int
|
||||
|
||||
// DB is the connection to the database to use.
|
||||
DB *sql.DB
|
||||
|
||||
// Timeout is the duration a check has time to run.
|
||||
// Set this to a reasonable value for all checks to avoid long running
|
||||
// checks blocking the execution.
|
||||
Timeout time.Duration
|
||||
|
||||
// Executor receives a check and must run the requested command in the
|
||||
// time of the context.
|
||||
// At the end it must return a CheckResult.
|
||||
Executor func(Check, context.Context) CheckResult
|
||||
|
||||
// HostIdentifier is used in notifications to point to the source of the
|
||||
// notification.
|
||||
HostIdentifier string
|
||||
}
|
||||
|
||||
// Check is contains the metadata to run a check and its current state.
|
||||
Check struct {
|
||||
// Command is the command to run as stored in the database.
|
||||
Command []string
|
||||
// ExitCodes contains the list of exit codes of past runs.
|
||||
ExitCodes []int
|
||||
|
||||
id int64 // the check instance id
|
||||
mappingId int // ID to map the result for this check
|
||||
}
|
||||
|
||||
// CheckResult is the result of a check. It may contain a message
|
||||
// and must contain an exit code.
|
||||
// The exit code should conform to the nagios specification of
|
||||
// 0 - okay
|
||||
// 1 - error
|
||||
// 2 - warning
|
||||
// 3 - unknown or executor errors
|
||||
// Other codes are also okay and may be mapped to different values, but
|
||||
// need further configuration in the system.
|
||||
CheckResult struct {
|
||||
ExitCode int
|
||||
Message string // Message will be shown in the frontend for context
|
||||
}
|
||||
)
|
||||
|
||||
func NewChecker(cfg CheckerConfig) (*Checker, error) {
|
||||
c := &Checker{db: cfg.DB,
|
||||
executor: cfg.Executor,
|
||||
timeout: cfg.Timeout,
|
||||
ident: cfg.HostIdentifier,
|
||||
}
|
||||
if c.executor == nil {
|
||||
return nil, fmt.Errorf("executor must not be nil")
|
||||
}
|
||||
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Next pulls the next check in line and runs the set executor.
|
||||
// The result is then updated in the database and a notification generated.
|
||||
func (c *Checker) Next() error {
|
||||
check := Check{}
|
||||
tx, err := c.db.Begin()
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not start database transaction: %w", err)
|
||||
}
|
||||
defer tx.Rollback()
|
||||
err = tx.
|
||||
QueryRow(`select check_id, cmdLine, states, mapping_id
|
||||
from active_checks
|
||||
where next_time < now()
|
||||
and enabled
|
||||
and checker_id = $1
|
||||
order by next_time
|
||||
for update skip locked
|
||||
limit 1;`, c.id).
|
||||
Scan(&check.id, &check.Command, &check.ExitCodes, &check.mappingId)
|
||||
if err != nil {
|
||||
if err == sql.ErrNoRows {
|
||||
return ErrNoCheck
|
||||
}
|
||||
return fmt.Errorf("could not get next check: %w", err)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
|
||||
defer cancel()
|
||||
result := c.executor(check, ctx)
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
result.Message = fmt.Sprintf("check took longer than %s", c.timeout)
|
||||
result.ExitCode = 2
|
||||
}
|
||||
|
||||
backToOkay := false
|
||||
if len(check.ExitCodes) == 0 && result.ExitCode == 0 {
|
||||
backToOkay = true
|
||||
} else if len(check.ExitCodes) > 0 && check.ExitCodes[0] > 0 && result.ExitCode == 0 {
|
||||
backToOkay = true
|
||||
}
|
||||
|
||||
if _, err := tx.Exec(`update active_checks ac
|
||||
set next_time = now() + intval, states = ARRAY[$2::int] || states[1:4],
|
||||
msg = $3,
|
||||
acknowledged = case when $4 then false else acknowledged end,
|
||||
state_since = case $2 when states[1] then state_since else now() end
|
||||
where check_id = $1`, check.id, result.ExitCode, result.Message, backToOkay); err != nil {
|
||||
return fmt.Errorf("could not update check '%d': %w", check.id, err)
|
||||
}
|
||||
|
||||
if _, err := tx.Exec(`insert into notifications(check_id, states, output, mapping_id, notifier_id, check_host)
|
||||
select $1, array_agg(ml.target), $2, $3, cn.notifier_id, $4
|
||||
from active_checks ac
|
||||
cross join lateral unnest(ac.states) s
|
||||
join checks_notify cn on ac.check_id = cn.check_id
|
||||
join mapping_level ml on ac.mapping_id = ml.mapping_id and s.s = ml.source
|
||||
where ac.check_id = $1
|
||||
and ac.acknowledged = false
|
||||
and cn.enabled = true
|
||||
group by cn.notifier_id;`, check.id, result.Message, check.mappingId, c.ident); err != nil {
|
||||
return fmt.Errorf("could not create notification '%d': %s", check.id, err)
|
||||
}
|
||||
tx.Commit()
|
||||
return nil
|
||||
}
|
Loading…
Reference in New Issue