diff options
author | Gibheer <gibheer+git@zero-knowledge.org> | 2021-12-02 15:55:48 +0100 |
---|---|---|
committer | Gibheer <gibheer+git@zero-knowledge.org> | 2021-12-02 15:55:48 +0100 |
commit | 0639a504ebcc48030f8eefd80839738cfb89a583 (patch) | |
tree | f7d8ad5e16f995088c48fc2aec4341bf4784666e /monzero.go | |
parent | e2b479c34fcee4c41009fc9e8fe869831fbd494f (diff) |
add basic infrastructure for writing a checker
These are the first components to interact with the database in a
predefined way to write a checker instance.
CheckExec serves as an example implementing the Executor interface.
Diffstat (limited to 'monzero.go')
-rw-r--r-- | monzero.go | 150 |
1 files changed, 150 insertions, 0 deletions
diff --git a/monzero.go b/monzero.go new file mode 100644 index 0000000..f2481a5 --- /dev/null +++ b/monzero.go @@ -0,0 +1,150 @@ +package monzero + +import ( + "context" + "database/sql" + "fmt" + "time" +) + +var ( + ErrNoCheck = fmt.Errorf("no check found to run") +) + +type ( + // Checker maintains the state of checks that need to be run. + Checker struct { + db *sql.DB + id int // id is the resolved checker id for this instance. + executor func(Check, context.Context) CheckResult + timeout time.Duration + ident string // the host identifier + } + + CheckerConfig struct { + // CheckerID is used to find the checks that need to be run by this + // instance. + CheckerID int + + // DB is the connection to the database to use. + DB *sql.DB + + // Timeout is the duration a check has time to run. + // Set this to a reasonable value for all checks to avoid long running + // checks blocking the execution. + Timeout time.Duration + + // Executor receives a check and must run the requested command in the + // time of the context. + // At the end it must return a CheckResult. + Executor func(Check, context.Context) CheckResult + + // HostIdentifier is used in notifications to point to the source of the + // notification. + HostIdentifier string + } + + // Check is contains the metadata to run a check and its current state. + Check struct { + // Command is the command to run as stored in the database. + Command []string + // ExitCodes contains the list of exit codes of past runs. + ExitCodes []int + + id int64 // the check instance id + mappingId int // ID to map the result for this check + } + + // CheckResult is the result of a check. It may contain a message + // and must contain an exit code. + // The exit code should conform to the nagios specification of + // 0 - okay + // 1 - error + // 2 - warning + // 3 - unknown or executor errors + // Other codes are also okay and may be mapped to different values, but + // need further configuration in the system. + CheckResult struct { + ExitCode int + Message string // Message will be shown in the frontend for context + } +) + +func NewChecker(cfg CheckerConfig) (*Checker, error) { + c := &Checker{db: cfg.DB, + executor: cfg.Executor, + timeout: cfg.Timeout, + ident: cfg.HostIdentifier, + } + if c.executor == nil { + return nil, fmt.Errorf("executor must not be nil") + } + + return c, nil +} + +// Next pulls the next check in line and runs the set executor. +// The result is then updated in the database and a notification generated. +func (c *Checker) Next() error { + check := Check{} + tx, err := c.db.Begin() + if err != nil { + return fmt.Errorf("could not start database transaction: %w", err) + } + defer tx.Rollback() + err = tx. + QueryRow(`select check_id, cmdLine, states, mapping_id + from active_checks + where next_time < now() + and enabled + and checker_id = $1 + order by next_time + for update skip locked + limit 1;`, c.id). + Scan(&check.id, &check.Command, &check.ExitCodes, &check.mappingId) + if err != nil { + if err == sql.ErrNoRows { + return ErrNoCheck + } + return fmt.Errorf("could not get next check: %w", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), c.timeout) + defer cancel() + result := c.executor(check, ctx) + if ctx.Err() == context.DeadlineExceeded { + result.Message = fmt.Sprintf("check took longer than %s", c.timeout) + result.ExitCode = 2 + } + + backToOkay := false + if len(check.ExitCodes) == 0 && result.ExitCode == 0 { + backToOkay = true + } else if len(check.ExitCodes) > 0 && check.ExitCodes[0] > 0 && result.ExitCode == 0 { + backToOkay = true + } + + if _, err := tx.Exec(`update active_checks ac + set next_time = now() + intval, states = ARRAY[$2::int] || states[1:4], + msg = $3, + acknowledged = case when $4 then false else acknowledged end, + state_since = case $2 when states[1] then state_since else now() end + where check_id = $1`, check.id, result.ExitCode, result.Message, backToOkay); err != nil { + return fmt.Errorf("could not update check '%d': %w", check.id, err) + } + + if _, err := tx.Exec(`insert into notifications(check_id, states, output, mapping_id, notifier_id, check_host) + select $1, array_agg(ml.target), $2, $3, cn.notifier_id, $4 + from active_checks ac + cross join lateral unnest(ac.states) s + join checks_notify cn on ac.check_id = cn.check_id + join mapping_level ml on ac.mapping_id = ml.mapping_id and s.s = ml.source + where ac.check_id = $1 + and ac.acknowledged = false + and cn.enabled = true + group by cn.notifier_id;`, check.id, result.Message, check.mappingId, c.ident); err != nil { + return fmt.Errorf("could not create notification '%d': %s", check.id, err) + } + tx.Commit() + return nil +} |