diff options
author | Gibheer <gibheer+git@zero-knowledge.org> | 2018-11-16 10:39:21 +0100 |
---|---|---|
committer | Gibheer <gibheer+git@zero-knowledge.org> | 2018-11-16 10:39:21 +0100 |
commit | bfd2b5d324d7348eff22c4a991fa0f21e09360f5 (patch) | |
tree | 263bf67bd215345e9be2b794d0ad0771e9de6f32 |
initial commit
-rw-r--r-- | README.md | 42 | ||||
-rw-r--r-- | cmd/moncheck/main.go | 141 | ||||
-rw-r--r-- | go.mod | 3 | ||||
-rw-r--r-- | go.sum | 2 | ||||
-rw-r--r-- | moncheck.conf | 4 | ||||
-rw-r--r-- | schema/20181116.sql | 78 |
6 files changed, 270 insertions, 0 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..40f0987 --- /dev/null +++ b/README.md @@ -0,0 +1,42 @@ +monzero +======= + +Monzero is a collection of tools with the purpose of running monitoring checks +and triggering notifications. + +requirements +------------ + +runtime requirements: +* PostgreSQL >= 10.0 + +build requirements: +* Go >= 1.11 + +components +---------- + +The following components exist: + +### moncheck + +Moncheck is the daemon that runs the checks and generates notifications in the +database. +It is possible to run multiple instances of moncheck, as it uses PostgreSQL +as a coordinator through the PostgreSQL internal locking mechanism. + +Moncheck uses the table `active_checks` to detect which checks to run. + +### monfront + +Monfront is a webfrontend to view the current state of all checks, configure +hosts, groups, checks and view current notifications. +It is possible to run multiple instances. + +### monwork + +Monwork is a small server that does all the maintenance work in the background. +It is responsible to cleanup the history and generate the configuration. + +The configuration is generated into `active_checks` when an entry in `nodes`, +`command` or `checks` was changed (detected through the updated column). diff --git a/cmd/moncheck/main.go b/cmd/moncheck/main.go new file mode 100644 index 0000000..ca31a02 --- /dev/null +++ b/cmd/moncheck/main.go @@ -0,0 +1,141 @@ +package main + +import ( + "bytes" + "context" + "database/sql" + "encoding/json" + "flag" + "io/ioutil" + "log" + "os/exec" + "sync" + "syscall" + "time" + + "github.com/lib/pq" +) + +var ( + configPath = flag.String("config", "moncheck.conf", "path to the config file") +) + +type ( + Config struct { + DB string `json:"db"` + Wait string `json:"wait"` + } +) + +func main() { + flag.Parse() + + raw, err := ioutil.ReadFile(*configPath) + if err != nil { + log.Fatalf("could not read config: %s", err) + } + config := Config{} + if err := json.Unmarshal(raw, &config); err != nil { + log.Fatalf("could not parse config: %s", err) + } + + waitDuration, err := time.ParseDuration(config.Wait) + if err != nil { + log.Fatalf("could not parse wait duration: %s", err) + } + + db, err := sql.Open("postgres", config.DB) + if err != nil { + log.Fatalf("could not open database connection: %s", err) + } + + for i := 0; i < 25; i++ { + go check(i, db, waitDuration) + } + wg := sync.WaitGroup{} + wg.Add(1) + wg.Wait() +} + +func check(thread int, db *sql.DB, waitDuration time.Duration) { + for { + tx, err := db.Begin() + if err != nil { + log.Printf("[%d] could not start transaction: %s", thread, err) + continue + } + rows, err := tx.Query("select check_id, check_id, cmdLine, states, next_time from active_checks where next_time < now() and enabled order by next_time for update skip locked limit 1;") + if err != nil { + log.Printf("[%d] could not start query: %s", thread, err) + tx.Rollback() + continue + } + var ( + id int64 + cmdLine []string + states []int64 + notify bool + ) + found := false + for rows.Next() { + if err := rows.Err(); err != nil { + log.Printf("could not fetch row: %s", err) + tx.Rollback() + break + } + if err := rows.Scan(&id, pq.Array(&cmdLine), pq.Array(&states), ¬ify); err != nil { + log.Printf("could not scan values: %s", err) + tx.Rollback() + break + } + found = true + } + if !found { + time.Sleep(waitDuration) + tx.Rollback() + continue + } + statePos := 5 + if len(states) < 6 { + statePos = len(states) + } + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + cmd := exec.CommandContext(ctx, cmdLine[0], cmdLine[1:]...) + output := bytes.NewBuffer([]byte{}) + cmd.Stdout = output + cmd.Stderr = output + if err != nil && err == context.Canceled { + log.Printf("[%d] check took too long: %s", id, err) + // TODO which state to choose? + // TODO add notification handler + // TODO all this casting should be done better + states = append([]int64{1}, states[:statePos]...) + } else if err != nil { + cancel() + exitErr, ok := err.(*exec.ExitError) + if !ok { + log.Printf("[%d]error running check: %s", id, err) + states = append([]int64{1}, states[:statePos]...) + } else { + status, ok := exitErr.Sys().(syscall.WaitStatus) + if !ok { + } else { + states = append([]int64{int64(status.ExitStatus())}, states[:statePos]...) + } + } + } else { + cancel() + states = append([]int64{0}, states[:statePos]...) + } + + if _, err := tx.Exec("update active_checks set next_time = now() + interval, states = $2 where check_id = $1", id, pq.Array(&states)); err != nil { + log.Printf("[%d] could not update row '%d': %s", thread, id, err) + } + if notify { + if _, err := tx.Exec("insert into notifications(check_id, states, output) values ($1, $2, $3);", id, pq.Array(&states), &output); err != nil { + log.Printf("[%d] could not create notification for '%d': %s", thread, id, err) + } + } + tx.Commit() + } +} @@ -0,0 +1,3 @@ +module git.zero-knowledge.org/gibheer/monzero + +require github.com/lib/pq v1.0.0 @@ -0,0 +1,2 @@ +github.com/lib/pq v1.0.0 h1:X5PMW56eZitiTeO7tKzZxFCSpbFZJtkMMooicw2us9A= +github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= diff --git a/moncheck.conf b/moncheck.conf new file mode 100644 index 0000000..5cee7a7 --- /dev/null +++ b/moncheck.conf @@ -0,0 +1,4 @@ +{ + "db": "user=zeromon password=zeromonmonmonmon sslmode=disable dbname=monzero", + "wait": "5s" +} diff --git a/schema/20181116.sql b/schema/20181116.sql new file mode 100644 index 0000000..cb97acf --- /dev/null +++ b/schema/20181116.sql @@ -0,0 +1,78 @@ +BEGIN; +CREATE TABLE notifier ( + id serial NOT NULL primary key, + name text NOT NULL +); + +CREATE TABLE groups ( + id serial NOT NULL primary key, + name text NOT NULL +); + +CREATE TABLE nodes ( + id bigserial NOT NULL primary key, + name text NOT NULL, + updated timestamp with time zone DEFAULT now() NOT NULL, + created timestamp with time zone DEFAULT now() NOT NULL, + message text NOT NULL +); + +CREATE TABLE nodes_groups ( + node_id bigint not null references nodes(id) on delete cascade, + group_id int not null references groups(id) on delete cascade, + unique(node_id, group_id) +); + +CREATE TABLE commands ( + id serial NOT NULL primary key, + name text NOT NULL, + command text NOT NULL, + updated timestamp with time zone DEFAULT now() NOT NULL, + created timestamp with time zone DEFAULT now() NOT NULL, + message text NOT NULL +); + +CREATE TABLE checks ( + id bigserial NOT NULL primary key, + node_id integer references nodes(id) on delete cascade, + command_id integer references commands(id) on delete cascade, + intval interval DEFAULT '00:05:00'::interval NOT NULL, + options jsonb DEFAULT '{}'::jsonb NOT NULL, + updated timestamp with time zone DEFAULT now() NOT NULL, + last_refresh timestamp with time zone, + enabled boolean not null DEFAULT true, + notifier_id integer NOT NULL, + notify boolean DEFAULT true NOT NULL, + message text NOT NULL, + unique(node_id, command_id) +); + +CREATE TABLE notifications ( + id bigserial NOT NULL primary key, + check_id bigint not null unique references checks(id) on delete cascade, + states integer[] not null, + output text, + inserted timestamp with time zone DEFAULT now() NOT NULL, + sent timestamp with time zone +); + +CREATE TABLE active_checks ( + check_id bigint NOT NULL references checks(id) on delete cascade, + cmdline text[] NOT NULL, + next_time timestamp with time zone DEFAULT now() NOT NULL, + states integer[] DEFAULT ARRAY[0] NOT NULL, + intval interval NOT NULL, + enabled boolean NOT NULL, + notify boolean NOT NULL, + notice text +); + +CREATE INDEX ON active_checks(next_time) WHERE enabled; +CREATE INDEX ON checks(updated, last_refresh nulls first); +CREATE INDEX ON checks(node_id); +CREATE INDEX ON checks(command_id); +CREATE INDEX ON notifications(inserted) WHERE sent is null; +CREATE INDEX ON notifications (check_id, inserted desc); +CREATE INDEX ON commands(updated); +CREATE INDEX ON nodes(updated); +COMMIT; |