add basic infrastructure for writing a checker

These are the first components to interact with the database in a predefined way to write a checker instance. CheckExec serves as an example implementing the Executor interface.
2021-12-02 15:55:48 +01:00 · 2021-12-02 15:55:48 +01:00 · 0639a504eb
parent e2b479c34f
commit 0639a504eb
2 changed files with 188 additions and 0 deletions
--- a/executor.go
+++ b/executor.go
@ -0,0 +1,38 @@
+package monzero
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"os/exec"
+	"syscall"
+)
+
+// CheckExec runs the command of a check and reports its outcome.
+//
+// check.Command[0] is used as the program and the remaining elements as its
+// arguments. Stdout and stderr are written into one shared buffer, which is
+// returned completely as the message of the result. The context is handed to
+// exec.CommandContext to bound the run time of the command.
+//
+// The exit code of the result is the exit status of the process. If no
+// command was given or no process state is available after the run, the exit
+// code is 3 and the message describes the problem. If the process state does
+// not provide a syscall.WaitStatus, the exit code is 2 and the message
+// contains the error.
+func CheckExec(check Check, ctx context.Context) CheckResult {
+	if len(check.Command) == 0 {
+		// Indexing an empty command would panic, so report it like any
+		// other executor error instead.
+		return CheckResult{ExitCode: 3, Message: "check has no command to run"}
+	}
+
+	var output bytes.Buffer
+	cmd := exec.CommandContext(ctx, check.Command[0], check.Command[1:]...)
+	cmd.Stdout = &output
+	cmd.Stderr = &output
+
+	err := cmd.Run()
+	if err == nil {
+		return CheckResult{Message: output.String()}
+	}
+
+	// No process state is available, so there is no exit status to report.
+	if cmd.ProcessState == nil {
+		return CheckResult{
+			ExitCode: 3,
+			// %w is only understood by fmt.Errorf, so format the error with %v.
+			Message: fmt.Sprintf("unknown error when running command: %v", err),
+		}
+	}
+
+	status, ok := cmd.ProcessState.Sys().(syscall.WaitStatus)
+	if !ok {
+		// Return right away so the error is not replaced by the output.
+		return CheckResult{
+			ExitCode: 2,
+			Message:  fmt.Sprintf("error running check: %v", err),
+		}
+	}
+
+	return CheckResult{ExitCode: status.ExitStatus(), Message: output.String()}
+}
--- a/monzero.go
+++ b/monzero.go
@ -0,0 +1,150 @@
+package monzero
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"time"
+)
+
+// ErrNoCheck is the error reported when there is no check to run.
+var ErrNoCheck = fmt.Errorf("no check found to run")
+
+type (
+	// Checker maintains the state of checks that need to be run.
+	Checker struct {
+		db       *sql.DB
+		id       int // id is the resolved checker id for this instance.
+		executor func(Check, context.Context) CheckResult
+		timeout  time.Duration
+		ident    string // the host identifier
+	}
+
+	// CheckerConfig holds the settings NewChecker uses to create a Checker.
+	CheckerConfig struct {
+		// CheckerID is used to find the checks that need to be run by this
+		// instance.
+		CheckerID int
+
+		// DB is the connection to the database to use.
+		DB *sql.DB
+
+		// Timeout is the duration a check has time to run.
+		// Set this to a reasonable value for all checks to avoid long running
+		// checks blocking the execution.
+		Timeout time.Duration
+
+		// Executor receives a check and must run the requested command in the
+		// time of the context.
+		// At the end it must return a CheckResult.
+		Executor func(Check, context.Context) CheckResult
+
+		// HostIdentifier is used in notifications to point to the source of the
+		// notification.
+		HostIdentifier string
+	}
+
+	// Check contains the metadata to run a check and its current state.
+	Check struct {
+		// Command is the command to run as stored in the database.
+		// The first element is the program, the rest are its arguments.
+		Command []string
+		// ExitCodes contains the list of exit codes of past runs.
+		// The first element is treated as the most recent one.
+		ExitCodes []int
+
+		id        int64 // the check instance id
+		mappingId int   // ID to map the result for this check
+	}
+
+	// CheckResult is the result of a check. It may contain a message
+	// and must contain an exit code.
+	// The exit code should conform to the nagios plugin specification of
+	// 0 - okay
+	// 1 - warning
+	// 2 - critical
+	// 3 - unknown or executor errors
+	// Other codes are also okay and may be mapped to different values, but
+	// need further configuration in the system.
+	CheckResult struct {
+		ExitCode int
+		Message  string // Message will be shown in the frontend for context
+	}
+)
+
+// NewChecker creates a Checker from the given configuration.
+//
+// It returns an error when the configuration contains no executor or no
+// database connection, because Next needs both to work.
+func NewChecker(cfg CheckerConfig) (*Checker, error) {
+	if cfg.Executor == nil {
+		return nil, fmt.Errorf("executor must not be nil")
+	}
+	if cfg.DB == nil {
+		// Next would panic on a nil connection, so refuse it here.
+		return nil, fmt.Errorf("database connection must not be nil")
+	}
+
+	// The id must be taken over from the config, otherwise Next would always
+	// look for checks of checker 0.
+	c := &Checker{
+		db:       cfg.DB,
+		id:       cfg.CheckerID,
+		executor: cfg.Executor,
+		timeout:  cfg.Timeout,
+		ident:    cfg.HostIdentifier,
+	}
+
+	return c, nil
+}
+
+// Next pulls the next check in line and runs the set executor.
+// The result is then updated in the database and a notification generated.
+//
+// The check is selected and locked in a transaction, which stays open while
+// the executor runs. The update of the check and the notification are written
+// in the same transaction, which is committed at the end.
+//
+// ErrNoCheck is returned when the query finds no check. All other errors are
+// returned with a description of the step that failed.
+func (c *Checker) Next() error {
+	check := Check{}
+	tx, err := c.db.Begin()
+	if err != nil {
+		return fmt.Errorf("could not start database transaction: %w", err)
+	}
+	// Rollback undoes everything on the error paths. After a successful
+	// commit the transaction is already done and the returned error is of
+	// no interest.
+	defer tx.Rollback()
+
+	// NOTE(review): database/sql does not convert array columns into Go
+	// slices by itself; presumably cmdLine and states need a driver specific
+	// wrapper to be scanned - confirm against the driver in use.
+	err = tx.
+		QueryRow(`select check_id, cmdLine, states, mapping_id
+			from active_checks
+			where next_time < now()
+				and enabled
+				and checker_id = $1
+			order by next_time
+			for update skip locked
+			limit 1;`, c.id).
+		Scan(&check.id, &check.Command, &check.ExitCodes, &check.mappingId)
+	if err != nil {
+		if err == sql.ErrNoRows {
+			return ErrNoCheck
+		}
+		return fmt.Errorf("could not get next check: %w", err)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
+	defer cancel()
+	result := c.executor(check, ctx)
+	if ctx.Err() == context.DeadlineExceeded {
+		// The timeout takes precedence over whatever the executor returned.
+		result.Message = fmt.Sprintf("check took longer than %s", c.timeout)
+		result.ExitCode = 2
+	}
+
+	// The check is back to okay when the new exit code is 0 and the check
+	// either has no history yet or the last recorded exit code was above 0.
+	backToOkay := result.ExitCode == 0 &&
+		(len(check.ExitCodes) == 0 || check.ExitCodes[0] > 0)
+
+	if _, err := tx.Exec(`update active_checks ac
+		set next_time = now() + intval, states = ARRAY[$2::int] || states[1:4],
+				msg = $3,
+				acknowledged = case when $4 then false else acknowledged end,
+				state_since = case $2 when states[1] then state_since else now() end
+			where check_id = $1`, check.id, result.ExitCode, result.Message, backToOkay); err != nil {
+		return fmt.Errorf("could not update check '%d': %w", check.id, err)
+	}
+
+	if _, err := tx.Exec(`insert into notifications(check_id, states, output, mapping_id, notifier_id, check_host)
+			select $1, array_agg(ml.target), $2, $3, cn.notifier_id, $4
+			from active_checks ac
+			cross join lateral unnest(ac.states) s
+			join checks_notify cn on ac.check_id = cn.check_id
+			join mapping_level ml on ac.mapping_id = ml.mapping_id and s.s = ml.source
+			where ac.check_id = $1
+				and ac.acknowledged = false
+				and cn.enabled = true 
+			group by cn.notifier_id;`, check.id, result.Message, check.mappingId, c.ident); err != nil {
+		return fmt.Errorf("could not create notification '%d': %w", check.id, err)
+	}
+
+	// A failed commit means nothing was stored, so it must not be ignored.
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("could not commit result of check '%d': %w", check.id, err)
+	}
+	return nil
+}