aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGibheer <gibheer+git@zero-knowledge.org>2018-11-16 10:39:21 +0100
committerGibheer <gibheer+git@zero-knowledge.org>2018-11-16 10:39:21 +0100
commitbfd2b5d324d7348eff22c4a991fa0f21e09360f5 (patch)
tree263bf67bd215345e9be2b794d0ad0771e9de6f32
initial commit
-rw-r--r--README.md42
-rw-r--r--cmd/moncheck/main.go141
-rw-r--r--go.mod3
-rw-r--r--go.sum2
-rw-r--r--moncheck.conf4
-rw-r--r--schema/20181116.sql78
6 files changed, 270 insertions, 0 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..40f0987
--- /dev/null
+++ b/README.md
@@ -0,0 +1,42 @@
+monzero
+=======
+
+Monzero is a collection of tools with the purpose of running monitoring checks
+and triggering notifications.
+
+requirements
+------------
+
+runtime requirements:
+* PostgreSQL >= 10.0
+
+build requirements:
+* Go >= 1.11
+
+components
+----------
+
+The following components exist:
+
+### moncheck
+
+Moncheck is the daemon that runs the checks and generates notifications in the
+database.
+It is possible to run multiple instances of moncheck, as it uses PostgreSQL
+as a coordinator through the PostgreSQL internal locking mechanism.
+
+Moncheck uses the table `active_checks` to detect which checks to run.
+
+### monfront
+
+Monfront is a webfrontend to view the current state of all checks, configure
+hosts, groups, checks and view current notifications.
+It is possible to run multiple instances.
+
+### monwork
+
+Monwork is a small server that does all the maintenance work in the background.
+It is responsible to cleanup the history and generate the configuration.
+
+The configuration is generated into `active_checks` when an entry in `nodes`,
+`command` or `checks` was changed (detected through the updated column).
diff --git a/cmd/moncheck/main.go b/cmd/moncheck/main.go
new file mode 100644
index 0000000..ca31a02
--- /dev/null
+++ b/cmd/moncheck/main.go
@@ -0,0 +1,141 @@
+package main
+
+import (
+ "bytes"
+ "context"
+ "database/sql"
+ "encoding/json"
+ "flag"
+ "io/ioutil"
+ "log"
+ "os/exec"
+ "sync"
+ "syscall"
+ "time"
+
+ "github.com/lib/pq"
+)
+
+var (
+ configPath = flag.String("config", "moncheck.conf", "path to the config file")
+)
+
+type (
+ Config struct {
+ DB string `json:"db"`
+ Wait string `json:"wait"`
+ }
+)
+
+func main() {
+ flag.Parse()
+
+ raw, err := ioutil.ReadFile(*configPath)
+ if err != nil {
+ log.Fatalf("could not read config: %s", err)
+ }
+ config := Config{}
+ if err := json.Unmarshal(raw, &config); err != nil {
+ log.Fatalf("could not parse config: %s", err)
+ }
+
+ waitDuration, err := time.ParseDuration(config.Wait)
+ if err != nil {
+ log.Fatalf("could not parse wait duration: %s", err)
+ }
+
+ db, err := sql.Open("postgres", config.DB)
+ if err != nil {
+ log.Fatalf("could not open database connection: %s", err)
+ }
+
+ for i := 0; i < 25; i++ {
+ go check(i, db, waitDuration)
+ }
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+ wg.Wait()
+}
+
+func check(thread int, db *sql.DB, waitDuration time.Duration) {
+ for {
+ tx, err := db.Begin()
+ if err != nil {
+ log.Printf("[%d] could not start transaction: %s", thread, err)
+ continue
+ }
+ rows, err := tx.Query("select check_id, check_id, cmdLine, states, next_time from active_checks where next_time < now() and enabled order by next_time for update skip locked limit 1;")
+ if err != nil {
+ log.Printf("[%d] could not start query: %s", thread, err)
+ tx.Rollback()
+ continue
+ }
+ var (
+ id int64
+ cmdLine []string
+ states []int64
+ notify bool
+ )
+ found := false
+ for rows.Next() {
+ if err := rows.Err(); err != nil {
+ log.Printf("could not fetch row: %s", err)
+ tx.Rollback()
+ break
+ }
+ if err := rows.Scan(&id, pq.Array(&cmdLine), pq.Array(&states), &notify); err != nil {
+ log.Printf("could not scan values: %s", err)
+ tx.Rollback()
+ break
+ }
+ found = true
+ }
+ if !found {
+ time.Sleep(waitDuration)
+ tx.Rollback()
+ continue
+ }
+ statePos := 5
+ if len(states) < 6 {
+ statePos = len(states)
+ }
+ ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
+ cmd := exec.CommandContext(ctx, cmdLine[0], cmdLine[1:]...)
+ output := bytes.NewBuffer([]byte{})
+ cmd.Stdout = output
+ cmd.Stderr = output
+ if err != nil && err == context.Canceled {
+ log.Printf("[%d] check took too long: %s", id, err)
+ // TODO which state to choose?
+ // TODO add notification handler
+ // TODO all this casting should be done better
+ states = append([]int64{1}, states[:statePos]...)
+ } else if err != nil {
+ cancel()
+ exitErr, ok := err.(*exec.ExitError)
+ if !ok {
+ log.Printf("[%d]error running check: %s", id, err)
+ states = append([]int64{1}, states[:statePos]...)
+ } else {
+ status, ok := exitErr.Sys().(syscall.WaitStatus)
+ if !ok {
+ } else {
+ states = append([]int64{int64(status.ExitStatus())}, states[:statePos]...)
+ }
+ }
+ } else {
+ cancel()
+ states = append([]int64{0}, states[:statePos]...)
+ }
+
+ if _, err := tx.Exec("update active_checks set next_time = now() + interval, states = $2 where check_id = $1", id, pq.Array(&states)); err != nil {
+ log.Printf("[%d] could not update row '%d': %s", thread, id, err)
+ }
+ if notify {
+ if _, err := tx.Exec("insert into notifications(check_id, states, output) values ($1, $2, $3);", id, pq.Array(&states), &output); err != nil {
+ log.Printf("[%d] could not create notification for '%d': %s", thread, id, err)
+ }
+ }
+ tx.Commit()
+ }
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..b0633ef
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module git.zero-knowledge.org/gibheer/monzero
+
+require github.com/lib/pq v1.0.0
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..56d4251
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,2 @@
+github.com/lib/pq v1.0.0 h1:X5PMW56eZitiTeO7tKzZxFCSpbFZJtkMMooicw2us9A=
+github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
diff --git a/moncheck.conf b/moncheck.conf
new file mode 100644
index 0000000..5cee7a7
--- /dev/null
+++ b/moncheck.conf
@@ -0,0 +1,4 @@
+{
+ "db": "user=zeromon password=zeromonmonmonmon sslmode=disable dbname=monzero",
+ "wait": "5s"
+}
diff --git a/schema/20181116.sql b/schema/20181116.sql
new file mode 100644
index 0000000..cb97acf
--- /dev/null
+++ b/schema/20181116.sql
@@ -0,0 +1,78 @@
+BEGIN;
+CREATE TABLE notifier (
+ id serial NOT NULL primary key,
+ name text NOT NULL
+);
+
+CREATE TABLE groups (
+ id serial NOT NULL primary key,
+ name text NOT NULL
+);
+
+CREATE TABLE nodes (
+ id bigserial NOT NULL primary key,
+ name text NOT NULL,
+ updated timestamp with time zone DEFAULT now() NOT NULL,
+ created timestamp with time zone DEFAULT now() NOT NULL,
+ message text NOT NULL
+);
+
+CREATE TABLE nodes_groups (
+ node_id bigint not null references nodes(id) on delete cascade,
+ group_id int not null references groups(id) on delete cascade,
+ unique(node_id, group_id)
+);
+
+CREATE TABLE commands (
+ id serial NOT NULL primary key,
+ name text NOT NULL,
+ command text NOT NULL,
+ updated timestamp with time zone DEFAULT now() NOT NULL,
+ created timestamp with time zone DEFAULT now() NOT NULL,
+ message text NOT NULL
+);
+
+CREATE TABLE checks (
+ id bigserial NOT NULL primary key,
+ node_id integer references nodes(id) on delete cascade,
+ command_id integer references commands(id) on delete cascade,
+ intval interval DEFAULT '00:05:00'::interval NOT NULL,
+ options jsonb DEFAULT '{}'::jsonb NOT NULL,
+ updated timestamp with time zone DEFAULT now() NOT NULL,
+ last_refresh timestamp with time zone,
+ enabled boolean not null DEFAULT true,
+ notifier_id integer NOT NULL,
+ notify boolean DEFAULT true NOT NULL,
+ message text NOT NULL,
+ unique(node_id, command_id)
+);
+
+CREATE TABLE notifications (
+ id bigserial NOT NULL primary key,
+ check_id bigint not null unique references checks(id) on delete cascade,
+ states integer[] not null,
+ output text,
+ inserted timestamp with time zone DEFAULT now() NOT NULL,
+ sent timestamp with time zone
+);
+
+CREATE TABLE active_checks (
+ check_id bigint NOT NULL references checks(id) on delete cascade,
+ cmdline text[] NOT NULL,
+ next_time timestamp with time zone DEFAULT now() NOT NULL,
+ states integer[] DEFAULT ARRAY[0] NOT NULL,
+ intval interval NOT NULL,
+ enabled boolean NOT NULL,
+ notify boolean NOT NULL,
+ notice text
+);
+
+CREATE INDEX ON active_checks(next_time) WHERE enabled;
+CREATE INDEX ON checks(updated, last_refresh nulls first);
+CREATE INDEX ON checks(node_id);
+CREATE INDEX ON checks(command_id);
+CREATE INDEX ON notifications(inserted) WHERE sent is null;
+CREATE INDEX ON notifications (check_id, inserted desc);
+CREATE INDEX ON commands(updated);
+CREATE INDEX ON nodes(updated);
+COMMIT;