From 0639a504ebcc48030f8eefd80839738cfb89a583 Mon Sep 17 00:00:00 2001 From: Gibheer Date: Thu, 2 Dec 2021 15:55:48 +0100 Subject: [PATCH] add basic infrastructure for writing a checker These are the first components to interact with the database in a predefined way to write a checker instance. CheckExec serves as an example implementing the Executor interface. --- executor.go | 38 +++++++++++++ monzero.go | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+) create mode 100644 executor.go create mode 100644 monzero.go diff --git a/executor.go b/executor.go new file mode 100644 index 0000000..ca8e5e2 --- /dev/null +++ b/executor.go @@ -0,0 +1,38 @@ +package monzero + +import ( + "bytes" + "context" + "fmt" + "os/exec" + "syscall" +) + +// CheckExec runs a command line string. +// The output is recorded completely and returned as one message. +func CheckExec(check Check, ctx context.Context) CheckResult { + result := CheckResult{} + + cmd := exec.CommandContext(ctx, check.Command[0], check.Command[1:]...) + output := bytes.NewBuffer([]byte{}) + cmd.Stdout = output + cmd.Stderr = output + err := cmd.Run() + if err != nil { + if cmd.ProcessState == nil { + result.Message = fmt.Sprintf("unknown error when running command: %w", err) + result.ExitCode = 3 + return result + } + + status, ok := cmd.ProcessState.Sys().(syscall.WaitStatus) + if !ok { + result.Message = fmt.Sprintf("error running check: %w", err) + result.ExitCode = 2 + } else { + result.ExitCode = status.ExitStatus() + } + } + result.Message = output.String() + return result +} diff --git a/monzero.go b/monzero.go new file mode 100644 index 0000000..f2481a5 --- /dev/null +++ b/monzero.go @@ -0,0 +1,150 @@ +package monzero + +import ( + "context" + "database/sql" + "fmt" + "time" +) + +var ( + ErrNoCheck = fmt.Errorf("no check found to run") +) + +type ( + // Checker maintains the state of checks that need to be run. + Checker struct { + db *sql.DB + id int // id is the resolved checker id for this instance. + executor func(Check, context.Context) CheckResult + timeout time.Duration + ident string // the host identifier + } + + CheckerConfig struct { + // CheckerID is used to find the checks that need to be run by this + // instance. + CheckerID int + + // DB is the connection to the database to use. + DB *sql.DB + + // Timeout is the duration a check has time to run. + // Set this to a reasonable value for all checks to avoid long running + // checks blocking the execution. + Timeout time.Duration + + // Executor receives a check and must run the requested command in the + // time of the context. + // At the end it must return a CheckResult. + Executor func(Check, context.Context) CheckResult + + // HostIdentifier is used in notifications to point to the source of the + // notification. + HostIdentifier string + } + + // Check is contains the metadata to run a check and its current state. + Check struct { + // Command is the command to run as stored in the database. + Command []string + // ExitCodes contains the list of exit codes of past runs. + ExitCodes []int + + id int64 // the check instance id + mappingId int // ID to map the result for this check + } + + // CheckResult is the result of a check. It may contain a message + // and must contain an exit code. + // The exit code should conform to the nagios specification of + // 0 - okay + // 1 - error + // 2 - warning + // 3 - unknown or executor errors + // Other codes are also okay and may be mapped to different values, but + // need further configuration in the system. + CheckResult struct { + ExitCode int + Message string // Message will be shown in the frontend for context + } +) + +func NewChecker(cfg CheckerConfig) (*Checker, error) { + c := &Checker{db: cfg.DB, + executor: cfg.Executor, + timeout: cfg.Timeout, + ident: cfg.HostIdentifier, + } + if c.executor == nil { + return nil, fmt.Errorf("executor must not be nil") + } + + return c, nil +} + +// Next pulls the next check in line and runs the set executor. +// The result is then updated in the database and a notification generated. +func (c *Checker) Next() error { + check := Check{} + tx, err := c.db.Begin() + if err != nil { + return fmt.Errorf("could not start database transaction: %w", err) + } + defer tx.Rollback() + err = tx. + QueryRow(`select check_id, cmdLine, states, mapping_id + from active_checks + where next_time < now() + and enabled + and checker_id = $1 + order by next_time + for update skip locked + limit 1;`, c.id). + Scan(&check.id, &check.Command, &check.ExitCodes, &check.mappingId) + if err != nil { + if err == sql.ErrNoRows { + return ErrNoCheck + } + return fmt.Errorf("could not get next check: %w", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), c.timeout) + defer cancel() + result := c.executor(check, ctx) + if ctx.Err() == context.DeadlineExceeded { + result.Message = fmt.Sprintf("check took longer than %s", c.timeout) + result.ExitCode = 2 + } + + backToOkay := false + if len(check.ExitCodes) == 0 && result.ExitCode == 0 { + backToOkay = true + } else if len(check.ExitCodes) > 0 && check.ExitCodes[0] > 0 && result.ExitCode == 0 { + backToOkay = true + } + + if _, err := tx.Exec(`update active_checks ac + set next_time = now() + intval, states = ARRAY[$2::int] || states[1:4], + msg = $3, + acknowledged = case when $4 then false else acknowledged end, + state_since = case $2 when states[1] then state_since else now() end + where check_id = $1`, check.id, result.ExitCode, result.Message, backToOkay); err != nil { + return fmt.Errorf("could not update check '%d': %w", check.id, err) + } + + if _, err := tx.Exec(`insert into notifications(check_id, states, output, mapping_id, notifier_id, check_host) + select $1, array_agg(ml.target), $2, $3, cn.notifier_id, $4 + from active_checks ac + cross join lateral unnest(ac.states) s + join checks_notify cn on ac.check_id = cn.check_id + join mapping_level ml on ac.mapping_id = ml.mapping_id and s.s = ml.source + where ac.check_id = $1 + and ac.acknowledged = false + and cn.enabled = true + group by cn.notifier_id;`, check.id, result.Message, check.mappingId, c.ident); err != nil { + return fmt.Errorf("could not create notification '%d': %s", check.id, err) + } + tx.Commit() + return nil +}