Skip to content

feat: added metric that shows bytes held by non-active replication slot #210

New issue

Have a question about this project? Sign up for a free account to open an issue and contact its maintainers and the community.

By clicking “Sign up for ”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on ? Sign in to your account

Open
wants to merge 1 commit into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 62 additions & 2 deletions documentation/metrics.md
Original file line numberDiff line numberDiff line change
Expand Up@@ -3691,7 +3691,8 @@ Default config:
### Replication

Default config:
        lag_more_than_in_sec = 300
        lag_more_than_in_sec = 300\
        critical_bytes_held_by_non_active_slot = 1073741824 bytes

### Items

Expand DownExpand Up@@ -3763,6 +3764,37 @@ Default config:

*Non-active Replication Slots* calculates as count of slots with `false` active status.

- **Bytes Held By Non-active Replication Slots**

Zabbix item:
<table>
<tr>
<th>Name</th>
<td>PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}</td>
</tr>
<tr>
<th>Key</th>
<td>pgsql.replication.non_active_slots_held_bytes[{#NON_ACTIVE_SLOT_NAME}]</td>
</tr>
<tr>
<th>Type</th>
<td>Numeric (float)</td>
</tr>
<tr>
<th>Units</th>
<td>Bytes</td>
</tr>
<tr>
<th>Delta</th>
<td>As Is</td>
</tr>
<tr>
<th>Supported Version</th>
<td>10+</td>
</tr>
</table>

*Bytes Held By Non-active Replication Slots* calculates as the WAL distance in bytes from each slot's `restart_lsn` to the current WAL position, for slots with `false` active status.

- **Replication Lag**

Expand DownExpand Up@@ -3861,12 +3893,40 @@ Default config:
</tr>
</table>

- **PostgreSQL Replication: Non-active Slots Discovery**

Items:
<table>
<tr>
<th>Name</th>
<td>PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}</td>
</tr>
<tr>
<th>Key</th>
<td>pgsql.replication.non_active_slots_held_bytes[{#NON_ACTIVE_SLOT_NAME}]</td>
</tr>
<tr>
<th>Type</th>
<td>Numeric (float)</td>
</tr>
<tr>
<th>Units</th>
<td>Bytes</td>
</tr>
<tr>
<th>Delta</th>
<td>As Is</td>
</tr>
</table>

### Triggers

- **PostgreSQL Instance: server mode has been changed on {HOSTNAME} to {ITEM.LASTVALUE}**

- **PostgreSQL number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})**

Disabled by default
- **PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})**
Triggers if *PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}* exceeds `critical_bytes_held_by_non_active_slot`.
- **PostgreSQL lag too high on {HOSTNAME} (value={ITEM.LASTVALUE})**
Triggers if *PostgreSQL Replication: Replication Lag* exceeds `lag_more_than_in_sec`.

Expand Down
2 changes: 2 additions & 0 deletions mamonsu/lib/default_config.py
Original file line numberDiff line numberDiff line change
Expand Up@@ -35,6 +35,8 @@ def default_host():
host = os.environ.get('PGHOST') or 'auto'
if platform.FREEBSD:
host = os.environ.get('PGHOST') or 'auto'
if platform.DARWIN:
host = os.environ.get('PGHOST') or 'auto'
return host

@staticmethod
Expand Down
1 change: 1 addition & 0 deletions mamonsu/lib/platform.py
Original file line numberDiff line numberDiff line change
Expand Up@@ -3,5 +3,6 @@
LINUX = (sys.platform == 'linux' or sys.platform == 'linux2')
WINDOWS = (sys.platform == 'win32' or sys.platform == 'win64')
FREEBSD = ('freebsd' in sys.platform)
DARWIN = sys.platform == 'darwin'
UNIX = LINUX or FREEBSD
INTEGER_TYPES = int,
26 changes: 25 additions & 1 deletion mamonsu/plugins/pgsql/driver/pool.py
Original file line numberDiff line numberDiff line change
Expand Up@@ -86,7 +86,7 @@ class Pool(object):
"""
SELECT application_name,
{0}
coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_lsn))::int, 0) AS total_lag
coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_{2}))::int, 0) AS total_lag
FROM pg_stat_replication;
""",
"""
Expand All@@ -95,6 +95,30 @@ class Pool(object):
total_lag
FROM mamonsu.count_{1}_lag_lsn();
"""
),
"wal_held_bytes_master": (
"""
SELECT slot_name,
coalesce((pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn))::int, 0) AS wal_held_bytes
FROM pg_replication_slots;
""",
"""
SELECT slot_name,
wal_held_bytes
FROM mamonsu.bytes_held_by_inactive_slot_on_master();
"""
),
"wal_held_bytes_replica": (
"""
SELECT slot_name,
coalesce((pg_wal_lsn_diff(pg_last_wal_replay_lsn(), restart_lsn))::int, 0) AS wal_held_bytes
FROM pg_replication_slots;
""",
"""
SELECT slot_name,
wal_held_bytes
FROM mamonsu.bytes_held_by_inactive_slot_on_replica();
"""
)
}

Expand Down
69 changes: 65 additions & 4 deletions mamonsu/plugins/pgsql/replication.py
Original file line numberDiff line numberDiff line change
Expand Up@@ -13,7 +13,8 @@ class Replication(Plugin):
AgentPluginType = "pg"
# key: (macro, value)
plugin_macros = {
"critical_lag_seconds": [("macro", "{$CRITICAL_LAG_SECONDS}"), ("value", 60 * 5)]
"critical_lag_seconds": [("macro", "{$CRITICAL_LAG_SECONDS}"), ("value", 60 * 5)],
"critical_bytes_held_by_none_active_slot": [("macro", "{$CRITICAL_BYTES_HELD_BY_NON_ACTIVE_SLOT}"), ("value", 1024 * 1024 * 1024)]
}

# get time of replication lag
Expand All@@ -30,8 +31,15 @@ class Replication(Plugin):
WHERE active = 'false';
"""

query_bytes_held_by_non_active_slot = """
SELECT slot_name, coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::bigint, 0) AS wal_size_bytes
FROM pg_replication_slots
WHERE active = 'false';
"""

# for discovery rule for name of each replica
key_lsn_replication_discovery = "pgsql.replication.discovery{0}"
key_replication_non_active_slots_discovery = "pgsql.replication.non_active_slots_discovery{0}"
key_total_lag = "pgsql.replication.total_lag{0}"
# for PG 10 and higher
key_flush = "pgsql.replication.flush_lag{0}"
Expand All@@ -42,6 +50,7 @@ class Replication(Plugin):

key_replication = "pgsql.replication_lag{0}"
key_non_active_slots = "pgsql.replication.non_active_slots{0}"
key_non_active_slots_held_bytes = "pgsql.replication.non_active_slots_held_bytes{0}"

def run(self, zbx):

Expand DownExpand Up@@ -79,6 +88,14 @@ def run(self, zbx):
zbx.send("pgsql.replication.replay_lag[{0}]".format(info[0]), float(info[5]))
zbx.send("pgsql.replication.discovery[]", zbx.json({"data": lags}))
del lags
bytes_held_by_non_active_slot = Pooler.run_sql_type("wal_held_bytes_master", args=[])
if bytes_held_by_non_active_slot:
discovery = []
for info in bytes_held_by_non_active_slot:
discovery.append({"{#NON_ACTIVE_SLOT_NAME}": info[0]})
zbx.send("pgsql.replication.non_active_slots_held_bytes[{0}]".format(info[0]), int(info[1]))
zbx.send("pgsql.replication.non_active_slots_discovery[]", zbx.json({"data": discovery}))
del discovery
elif Pooler.is_superuser() or Pooler.is_bootstraped():
result_lags = Pooler.run_sql_type("wal_lag_lsn", args=[" ", "xlog", "location"])
if result_lags:
Expand All@@ -90,7 +107,15 @@ def run(self, zbx):
del lags
else:
self.disable_and_exit_if_not_superuser()

else:
bytes_held_by_non_active_slot = Pooler.run_sql_type("wal_held_bytes_replica", args=[])
if bytes_held_by_non_active_slot:
discovery = []
for info in bytes_held_by_non_active_slot:
discovery.append({"{#NON_ACTIVE_SLOT_NAME}": info[0]})
zbx.send("pgsql.replication.non_active_slots_held_bytes[{0}]".format(info[0]), int(info[1]))
zbx.send("pgsql.replication.non_active_slots_discovery[]", zbx.json({"data": discovery}))
del discovery
non_active_slots = Pooler.query(self.query_non_active_slots)
zbx.send(self.key_non_active_slots.format("[]"), int(non_active_slots[0][0]))

Expand DownExpand Up@@ -132,7 +157,8 @@ def triggers(self, template, dasard=False):
}) + template.trigger({
"name": "PostgreSQL Replication: number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})",
"expression": "{#TEMPLATE:" + self.right_type(self.key_non_active_slots) + ".last()}&gt;" + str(
NUMBER_NON_ACTIVE_SLOTS)
NUMBER_NON_ACTIVE_SLOTS),
"status": 1
})
return triggers

Expand DownExpand Up@@ -198,7 +224,42 @@ def discovery_rules(self, template, dasard=False):
]
}
]
return template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs)
active_slots_discovery_rule = template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs)

rule = {
"name": "PostgreSQL Replication: Non Active Slots Discovery",
"key": self.key_replication_non_active_slots_discovery.format("[{0}]".format(self.Macros[self.Type]))
}
if Plugin.old_zabbix:
conditions = []
rule["filter"] = "{#NON_ACTIVE_SLOT_NAME}:.*"
else:
conditions = [{
"condition": [
{"macro": "{#NON_ACTIVE_SLOT_NAME}",
"value": ".*",
"operator": 8,
"formulaid": "A"}
]
}]
items = [
{"key": self.right_type(self.key_non_active_slots_held_bytes, var_discovery="{#NON_ACTIVE_SLOT_NAME},"),
"name": "PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}",
"value_type": Plugin.VALUE_TYPE.numeric_float,
"delay": self.plugin_config("interval"),
"drawtype": 2}
]
graphs = []
triggers = [
{
"name": "PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})",
"expression": "{#TEMPLATE:" + self.right_type(self.key_non_active_slots_held_bytes, var_discovery="{#NON_ACTIVE_SLOT_NAME},") + ".last()}&gt;" +
self.plugin_macros["critical_bytes_held_by_none_active_slot"][0][1]
}
]
non_active_slots_discovery_rule = template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs, triggers=triggers)

return active_slots_discovery_rule + non_active_slots_discovery_rule

def keys_and_queries(self, template_zabbix):
result = []
Expand Down
17 changes: 17 additions & 0 deletions mamonsu/tools/bootstrap/sql.py
Original file line numberDiff line numberDiff line change
Expand Up@@ -236,6 +236,23 @@
coalesce((pg_{7}_diff(pg_current_{7}(), replay_{9}))::bigint, 0) AS total_lag
FROM pg_stat_replication
$$ LANGUAGE SQL SECURITY DEFINER;

DROP FUNCTION IF EXISTS mamonsu.bytes_held_by_inactive_slot_on_master();
CREATE OR REPLACE FUNCTION mamonsu.bytes_held_by_inactive_slot_on_master()
RETURNS TABLE(slot_name TEXT, wal_held_bytes BIGINT) AS $$
SELECT slot_name::TEXT, coalesce((pg_{7}_diff(pg_current_wal_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes
FROM pg_replication_slots
WHERE active = 'false'
$$ LANGUAGE SQL SECURITY DEFINER;

DROP FUNCTION IF EXISTS mamonsu.bytes_held_by_inactive_slot_on_replica();
CREATE OR REPLACE FUNCTION mamonsu.bytes_held_by_inactive_slot_on_replica()
RETURNS TABLE(slot_name TEXT, wal_held_bytes BIGINT) AS $$
SELECT slot_name::TEXT, coalesce((pg_{7}_diff(pg_last_wal_replay_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes
FROM pg_replication_slots
WHERE active = 'false'
$$ LANGUAGE SQL SECURITY DEFINER;

"""

CreatePgBuffercacheFunctionsSQL = """
Expand Down