add basic infrastructure for writing a checker

These are the first components to interact with the database in a predefined way to write a checker instance. CheckExec serves as an example implementing the Executor interface.
2021-12-02 15:55:48 +01:00 · 2021-12-02 15:55:48 +01:00 · 0639a504eb
parent e2b479c34f
commit 0639a504eb
2 changed files with 188 additions and 0 deletions
--- a/executor.go
+++ b/executor.go
@ -0,0 +1,38 @@
 package monzero
 import (
 	"bytes"
 	"context"
 	"fmt"
 	"os/exec"
 	"syscall"
 )
 // CheckExec runs a command line string.
 // The output is recorded completely and returned as one message.
 func CheckExec(check Check, ctx context.Context) CheckResult {
 	result := CheckResult{}
 	cmd := exec.CommandContext(ctx, check.Command[0], check.Command[1:]...)
 	output := bytes.NewBuffer([]byte{})
 	cmd.Stdout = output
 	cmd.Stderr = output
 	err := cmd.Run()
 	if err != nil {
 		if cmd.ProcessState == nil {
 			result.Message = fmt.Sprintf("unknown error when running command: %w", err)
 			result.ExitCode = 3
 			return result
 		}
 		status, ok := cmd.ProcessState.Sys().(syscall.WaitStatus)
 		if !ok {
 			result.Message = fmt.Sprintf("error running check: %w", err)
 			result.ExitCode = 2
 		} else {
 			result.ExitCode = status.ExitStatus()
 		}
 	}
 	result.Message = output.String()
 	return result
 }
--- a/monzero.go
+++ b/monzero.go
@ -0,0 +1,150 @@
 package monzero
 import (
 	"context"
 	"database/sql"
 	"fmt"
 	"time"
 )
 var (
 	ErrNoCheck = fmt.Errorf("no check found to run")
 )
 type (
 	// Checker maintains the state of checks that need to be run.
 	Checker struct {
 		db       *sql.DB
 		id       int // id is the resolved checker id for this instance.
 		executor func(Check, context.Context) CheckResult
 		timeout  time.Duration
 		ident    string // the host identifier
 	}
 	CheckerConfig struct {
 		// CheckerID is used to find the checks that need to be run by this
 		// instance.
 		CheckerID int
 		// DB is the connection to the database to use.
 		DB *sql.DB
 		// Timeout is the duration a check has time to run.
 		// Set this to a reasonable value for all checks to avoid long running
 		// checks blocking the execution.
 		Timeout time.Duration
 		// Executor receives a check and must run the requested command in the
 		// time of the context.
 		// At the end it must return a CheckResult.
 		Executor func(Check, context.Context) CheckResult
 		// HostIdentifier is used in notifications to point to the source of the
 		// notification.
 		HostIdentifier string
 	}
 	// Check is contains the metadata to run a check and its current state.
 	Check struct {
 		// Command is the command to run as stored in the database.
 		Command []string
 		// ExitCodes contains the list of exit codes of past runs.
 		ExitCodes []int
 		id        int64 // the check instance id
 		mappingId int   // ID to map the result for this check
 	}
 	// CheckResult is the result of a check. It may contain a message
 	// and must contain an exit code.
 	// The exit code should conform to the nagios specification of
 	// 0 - okay
 	// 1 - error
 	// 2 - warning
 	// 3 - unknown or executor errors
 	// Other codes are also okay and may be mapped to different values, but
 	// need further configuration in the system.
 	CheckResult struct {
 		ExitCode int
 		Message  string // Message will be shown in the frontend for context
 	}
 )
 func NewChecker(cfg CheckerConfig) (*Checker, error) {
 	c := &Checker{db: cfg.DB,
 		executor: cfg.Executor,
 		timeout:  cfg.Timeout,
 		ident:    cfg.HostIdentifier,
 	}
 	if c.executor == nil {
 		return nil, fmt.Errorf("executor must not be nil")
 	}
 	return c, nil
 }
 // Next pulls the next check in line and runs the set executor.
 // The result is then updated in the database and a notification generated.
 func (c *Checker) Next() error {
 	check := Check{}
 	tx, err := c.db.Begin()
 	if err != nil {
 		return fmt.Errorf("could not start database transaction: %w", err)
 	}
 	defer tx.Rollback()
 	err = tx.
 		QueryRow(`select check_id, cmdLine, states, mapping_id
 			from active_checks
 			where next_time < now()
 				and enabled
 				and checker_id = $1
 			order by next_time
 			for update skip locked
 			limit 1;`, c.id).
 		Scan(&check.id, &check.Command, &check.ExitCodes, &check.mappingId)
 	if err != nil {
 		if err == sql.ErrNoRows {
 			return ErrNoCheck
 		}
 		return fmt.Errorf("could not get next check: %w", err)
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
 	defer cancel()
 	result := c.executor(check, ctx)
 	if ctx.Err() == context.DeadlineExceeded {
 		result.Message = fmt.Sprintf("check took longer than %s", c.timeout)
 		result.ExitCode = 2
 	}
 	backToOkay := false
 	if len(check.ExitCodes) == 0 && result.ExitCode == 0 {
 		backToOkay = true
 	} else if len(check.ExitCodes) > 0 && check.ExitCodes[0] > 0 && result.ExitCode == 0 {
 		backToOkay = true
 	}
 	if _, err := tx.Exec(`update active_checks ac
 		set next_time = now() + intval, states = ARRAY[$2::int] || states[1:4],
 				msg = $3,
 				acknowledged = case when $4 then false else acknowledged end,
 				state_since = case $2 when states[1] then state_since else now() end
 			where check_id = $1`, check.id, result.ExitCode, result.Message, backToOkay); err != nil {
 		return fmt.Errorf("could not update check '%d': %w", check.id, err)
 	}
 	if _, err := tx.Exec(`insert into notifications(check_id, states, output, mapping_id, notifier_id, check_host)
 			select $1, array_agg(ml.target), $2, $3, cn.notifier_id, $4
 			from active_checks ac
 			cross join lateral unnest(ac.states) s
 			join checks_notify cn on ac.check_id = cn.check_id
 			join mapping_level ml on ac.mapping_id = ml.mapping_id and s.s = ml.source
 			where ac.check_id = $1
 				and ac.acknowledged = false
 				and cn.enabled = true 
 			group by cn.notifier_id;`, check.id, result.Message, check.mappingId, c.ident); err != nil {
 		return fmt.Errorf("could not create notification '%d': %s", check.id, err)
 	}
 	tx.Commit()
 	return nil
 }