monzero/monzero.go

164 lines
4.7 KiB
Go

package monzero
import (
"context"
"database/sql"
"fmt"
"log/slog"
"time"
)
var (
ErrNoCheck = fmt.Errorf("no check found to run")
)
type (
// Checker maintains the state of checks that need to be run.
Checker struct {
db *sql.DB
id int // id is the resolved checker id for this instance.
executor func(Check, context.Context) CheckResult
timeout time.Duration
ident string // the host identifier
logger *slog.Logger
}
CheckerConfig struct {
// CheckerID is used to find the checks that need to be run by this
// instance.
CheckerID int
// DB is the connection to the database to use.
DB *sql.DB
// Timeout is the duration a check has time to run.
// Set this to a reasonable value for all checks to avoid long running
// checks blocking the execution.
Timeout time.Duration
// Executor receives a check and must run the requested command in the
// time of the context.
// At the end it must return a CheckResult.
Executor func(Check, context.Context) CheckResult
// HostIdentifier is used in notifications to point to the source of the
// notification.
HostIdentifier string
// Checker will send debug details to the logger for each command executed.
Logger *slog.Logger
}
// Check is contains the metadata to run a check and its current state.
Check struct {
// Command is the command to run as stored in the database.
Command []string
// ExitCodes contains the list of exit codes of past runs.
ExitCodes []int
id int64 // the check instance id
mappingId int // ID to map the result for this check
}
// CheckResult is the result of a check. It may contain a message
// and must contain an exit code.
// The exit code should conform to the nagios specification of
// 0 - okay
// 1 - error
// 2 - warning
// 3 - unknown or executor errors
// Other codes are also okay and may be mapped to different values, but
// need further configuration in the system.
CheckResult struct {
ExitCode int
Message string // Message will be shown in the frontend for context
}
)
func NewChecker(cfg CheckerConfig) (*Checker, error) {
c := &Checker{db: cfg.DB,
executor: cfg.Executor,
timeout: cfg.Timeout,
ident: cfg.HostIdentifier,
logger: cfg.Logger,
}
if c.executor == nil {
return nil, fmt.Errorf("executor must not be nil")
}
return c, nil
}
// Next pulls the next check in line and runs the set executor.
// The result is then updated in the database and a notification generated.
func (c *Checker) Next() error {
check := Check{}
tx, err := c.db.Begin()
if err != nil {
return fmt.Errorf("could not start database transaction: %w", err)
}
defer tx.Rollback()
err = tx.
QueryRow(`select check_id, cmdLine, states, mapping_id
from active_checks
where next_time < now()
and enabled
and checker_id = $1
order by next_time
for update skip locked
limit 1;`, c.id).
Scan(&check.id, &check.Command, &check.ExitCodes, &check.mappingId)
if err != nil {
if err == sql.ErrNoRows {
return ErrNoCheck
}
return fmt.Errorf("could not get next check: %w", err)
}
ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
defer cancel()
result := c.executor(check, ctx)
if ctx.Err() == context.DeadlineExceeded {
result.Message = fmt.Sprintf("check took longer than %s", c.timeout)
result.ExitCode = 2
}
c.logger.Debug(
"check command run",
"id", check.id,
"command", check.Command,
"exit code", result.ExitCode,
"message", result.Message,
)
backToOkay := false
if len(check.ExitCodes) == 0 && result.ExitCode == 0 {
backToOkay = true
} else if len(check.ExitCodes) > 0 && check.ExitCodes[0] > 0 && result.ExitCode == 0 {
backToOkay = true
}
if _, err := tx.Exec(`update active_checks ac
set next_time = now() + intval, states = ARRAY[$2::int] || states[1:4],
msg = $3,
acknowledged = case when $4 then false else acknowledged end,
state_since = case $2 when states[1] then state_since else now() end
where check_id = $1`, check.id, result.ExitCode, result.Message, backToOkay); err != nil {
return fmt.Errorf("could not update check '%d': %w", check.id, err)
}
if _, err := tx.Exec(`insert into notifications(check_id, states, output, mapping_id, notifier_id, check_host)
select $1, array_agg(ml.target), $2, $3, cn.notifier_id, $4
from active_checks ac
cross join lateral unnest(ac.states) s
join checks_notify cn on ac.check_id = cn.check_id
join mapping_level ml on ac.mapping_id = ml.mapping_id and s.s = ml.source
where ac.check_id = $1
and ac.acknowledged = false
and cn.enabled = true
group by cn.notifier_id;`, check.id, result.Message, check.mappingId, c.ident); err != nil {
return fmt.Errorf("could not create notification '%d': %s", check.id, err)
}
tx.Commit()
return nil
}