From: glyn Date: Fri, 19 Dec 2014 13:24:19 +0000 (+0000) Subject: Add in actual function and create a quick 9.1+ version using format rather than append X-Git-Url: https://git.8kb.co.uk/?a=commitdiff_plain;h=854b031fba63292ea22a9d653a4f2b844fe377a3;p=postgresql%2Fpg_log_long_xact Add in actual function and create a quick 9.1+ version using format rather than append --- diff --git a/README.md b/README.md new file mode 100755 index 0000000..b5e9998 --- /dev/null +++ b/README.md @@ -0,0 +1,82 @@ +pg_log_long_xact +================ + +Pl/pgsql function to log long running transactions in PostgreSQL, can optionally output details of any blocking statements and cancel/terminate problematic statements/transactions. + +Schedule with cron or similar if required. + +Use the cancel/terminate functionality with care, and see notes about better alternatives for some use cases built into postgres. + +Usage +--------- + +To simply log long running transactions just call the function passing the shortest duration to start logging at: + +```sql +TEST=# SELECT public.pg_log_long_xact('1s'); +NOTICE: long_xact pid: 4465 duration: 431.325747 ms user: glyn application: psql client: [local] statement: in transaction +NOTICE: long_xact pid: 16532 duration: 327.438302 ms user: glyn application: psql client: [local] statement: UPDATE balls SET description = 'TEST' WHERE id = 5; + pg_log_long_xact +--------------- +(0 rows) +``` + +To log extra information about locks the long running transactions might be waiting on, supply the second argument as true (although you could log similar information by just enabling log_lock_waits): + +```sql +TEST=# SELECT public.pg_log_long_xact('1s', true); +NOTICE: long_xact pid: 4465 duration: 471.885373 ms user: glyn application: psql client: [local] statement: in transaction +NOTICE: long_xact pid: 16532 duration: 367.997928 ms user: glyn application: psql client: [local] statement: UPDATE balls SET description = 'TEST' WHERE id = 5; +NOTICE: long_xact waiter pid: 16532 blocker detail is; pid: 4465 duration: 471.885373 ms relation: any (public.balls (RowExclusiveLock)) lock type: transaction id 311824482 user: glyn application: psql client: [local] statement: in transaction + pg_log_long_xact +--------------- +(0 rows) +``` + +To set the level of raise used pass a third argument which can be any of 'debug','log','info','notice','warning' or 'text' to output as a text resultset. +(in hindsight the default should probably have been 'log') + +```sql +TEST=# \t off +Showing only tuples. +TEST=# SELECT public.pg_log_long_xact('1s', true, 'text'); + long_xact pid: 4465 duration: 574.477076 ms user: glyn application: psql client: [local] statement: in transaction + long_xact pid: 16532 duration: 470.589631 ms user: glyn application: psql client: [local] statement: UPDATE balls SET description = 'TEST' WHERE id = 5; + long_xact waiter pid: 16532 blocker detail is; pid: 4465 duration: 574.477076 ms relation: any (public.balls (RowExclusiveLock)) lock type: transaction id 311824482 user: glyn application: psql client: [local] statement: in transaction +``` + +To make it start cancelling statements exceeding a specific duration we pass a duration as the fourth argument; the first transaction exceeding this will be cancelled +on each run, with blocking statements prioritised. + + (As of pg 9.3 there's a lock_timeout parameter that will abort any statement waiting longer than the specified number of milliseconds, which is much better. Note that the difference here is that this function will attempt to abort the blocking statement rather than the waiting one.) + +```sql +TEST=# SELECT public.pg_log_long_xact('1s', true, 'text', '10 minutes'); + long_xact pid: 4465 duration: 895.57988 ms user: glyn application: psql client: [local] statement: in transaction + long_xact unable to cancel backend with pid: 4465 + long_xact pid: 16532 duration: 791.692435 ms user: glyn application: psql client: [local] statement: UPDATE balls SET description = 'TEST' WHERE id = 5; + long_xact waiter pid: 16532 blocker detail is; pid: 4465 duration: 895.57988 ms relation: any (public.balls (RowExclusiveLock)) lock type: transaction id 31182 +4482 user: glyn application: psql client: [local] statement: in transaction +``` + +We can also provide a duration as the fith argument to terminate backends with transactions exceeding this, with prioritisation as per cancelling: + +```sql +TEST=# SELECT public.pg_log_long_xact('1s', true, 'text', '10 minutes', '15 minutes'); + long_xact pid: 4465 duration: 1026.90736 ms user: glyn application: psql client: [local] statement: in transaction + long_xact terminated backend with pid: 4465 + long_xact pid: 16532 duration: 923.019915 ms user: glyn application: psql client: [local] statement: UPDATE balls SET description = 'TEST' WHERE id = 5; +``` + +By default the function only cancels/terminates the longest running blocker, or longest running transaction, we can be more forceful than that +by passing an extra parameter to tell it to cancel statements/terminate backlends of all long running transactions it finds: + +```sql +TEST=#\t off +TEST=# SELECT public.pg_log_long_xact('1s', true, 'text', '2 minutes', '3 minutes', true); + long_xact pid: 19065 duration: 187.279089 ms user: glyn application: psql client: [local] statement: in transaction + long_xact terminated backend with pid: 19065 + long_xact pid: 16532 duration: 184.251569 ms user: glyn application: psql client: [local] statement: UPDATE balls SET description = 'TEST' WHERE id = 5; + long_xact cancelled backend with pid: 16532 +(4 rows) +``` diff --git a/pg_long_xact.plpgsql b/pg_long_xact.plpgsql new file mode 100755 index 0000000..313a5ec --- /dev/null +++ b/pg_long_xact.plpgsql @@ -0,0 +1,144 @@ +-- +-- Glyn Astill 27/05/2012 +-- Function to schedule in order to log and possibly terminate long running transactions +-- + +DROP FUNCTION IF EXISTS public.pg_log_long_xact(interval, boolean, text, interval, interval, boolean); +CREATE OR REPLACE FUNCTION public.pg_log_long_xact( + in_duration interval, -- Report all transactions running longer than this + in_report_blockers boolean DEFAULT false, -- Also report detail of the blocking transaction. Default false. + in_raise_type text DEFAULT 'notice', -- Level to use with raise, can be notice/debug/log/info/warning or something else to return as text. Default notice. + in_cancel_duration interval DEFAULT null, -- Attempt to cancel the current statement of longest running transaction if running longer than this, starting with blockers. Default don't do anything. + in_terminate_duration interval DEFAULT null, -- Attempt to terminate the longest running transaction if running longer than this, starting with blockers. Default don't do anything. + in_cancel_all boolean DEFAULT false) -- When false will only attempt to cancel/terminate one at a time, when true will mercilessly cancel/terminate all. Default false. +RETURNS SETOF text AS +$BODY$ +DECLARE + v_xact record; + v_blk_xact record; + v_line text; + v_cx_attempts integer := 0; +BEGIN + FOR v_xact IN ( + SELECT procpid AS pid, + usename, + COALESCE(application_name,'[unknown]') as app, + COALESCE(client_addr || ':' || client_port, '[local]') AS client, + current_query, + current_timestamp-xact_start AS runtime_int, + extract(EPOCH FROM current_timestamp-xact_start) AS runtime, + waiting + FROM pg_stat_activity + WHERE current_timestamp-xact_start > in_duration + AND datname = current_database() + ORDER BY waiting ASC, runtime DESC + ) LOOP + v_line := E'long_xact pid: ' || v_xact.pid || ' duration: ' || v_xact.runtime || ' ms user: ' || v_xact.usename || + ' application: ' || v_xact.app || ' client: ' || v_xact.client ||' statement: ' || v_xact.current_query; + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE LOG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + + -- Really better to enable log_lock_waits than use this + IF in_report_blockers AND v_xact.waiting THEN + SELECT INTO v_blk_xact extract(EPOCH FROM current_timestamp-ba.xact_start) AS blk_runtime, + b.pid AS blk_pid, + ba.usename AS blk_usename, + COALESCE(ba.application_name,'[unknown]') as blk_app, + COALESCE(ba.client_addr || ':' || ba.client_port, '[local]') AS blk_client, + ba.current_query AS blk_query, + COALESCE(t.schemaname || '.' || t.relname || ' (' || b.mode || ')', bt.locks, '[unknown]') AS blk_relation, + CASE b.locktype WHEN 'transactionid' THEN 'transaction id ' || b.transactionid ELSE b.locktype END AS blk_type + FROM pg_catalog.pg_locks l + LEFT JOIN pg_catalog.pg_locks b ON b.granted + AND ((b.locktype = 'transactionid' AND b.transactionid = l.transactionid) OR (b.locktype = 'relation' AND b.relation = l.relation)) + AND b.pid != l.pid + LEFT JOIN pg_catalog.pg_stat_activity ba ON ba.procpid = b.pid + LEFT JOIN pg_catalog.pg_stat_user_tables t ON l.relation = t.relid + LEFT JOIN (SELECT pid, 'any (' || string_agg(schemaname || '.' || relname || ' (' || mode || ')',', ') || ')' AS locks + FROM pg_locks JOIN pg_stat_user_tables ON relation = relid + WHERE mode ~ 'ExclusiveLock' + GROUP BY pid) bt ON bt.pid = b.pid + WHERE l.pid = v_xact.pid AND NOT l.granted LIMIT 1; + + IF FOUND THEN + v_line := E'long_xact waiter pid: ' || v_xact.pid || ' blocker detail is; pid: ' || v_blk_xact.blk_pid || ' duration: ' || + v_blk_xact.blk_runtime || ' ms relation: ' || v_blk_xact.blk_relation || ' lock type: ' || v_blk_xact.blk_type || + ' user: ' || v_blk_xact.blk_usename || ' application: ' || v_blk_xact.blk_app || ' client: ' || + v_blk_xact.blk_client || ' statement: ' || v_blk_xact.blk_query; + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE DEBUG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + END IF; + END IF; + + -- In 9.3+ it's really better to use lock_timeout than use this + IF (v_cx_attempts = 0 OR in_cancel_all) AND in_cancel_duration IS NOT NULL AND v_xact.runtime_int > in_cancel_duration THEN + v_cx_attempts := v_cx_attempts+1; + IF v_xact.current_query <> ' in transaction' AND pg_cancel_backend(v_xact.pid) THEN + v_line := 'long_xact cancelled backend with pid: ' || v_xact.pid; + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE DEBUG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + ELSIF in_terminate_duration IS NOT NULL AND v_xact.runtime_int > in_terminate_duration THEN + IF pg_terminate_backend(v_xact.pid) THEN + v_line := 'long_xact terminated backend with pid: ' || v_xact.pid; + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE DEBUG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + ELSE + v_line := 'long_xact unable to terminate backend with pid: ' || v_xact.pid; + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE DEBUG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + + END IF; + ELSE + v_line := 'long_xact unable to cancel backend with pid: ' || v_xact.pid; + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE DEBUG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + END IF; + END IF; + END LOOP; + + RETURN; +END; +$BODY$ +LANGUAGE plpgsql VOLATILE; diff --git a/pg_long_xact_9.1+.plpgsql b/pg_long_xact_9.1+.plpgsql new file mode 100755 index 0000000..a89f093 --- /dev/null +++ b/pg_long_xact_9.1+.plpgsql @@ -0,0 +1,143 @@ +-- +-- Glyn Astill 20/12/2014 +-- Function to schedule in order to log and possibly terminate long running transactions (pg9.1+) +-- + +DROP FUNCTION IF EXISTS public.pg_log_long_xact(interval, boolean, text, interval, interval, boolean); +CREATE OR REPLACE FUNCTION public.pg_log_long_xact( + in_duration interval, -- Report all transactions running longer than this + in_report_blockers boolean DEFAULT false, -- Also report detail of the blocking transaction. Default false. + in_raise_type text DEFAULT 'notice', -- Level to use with raise, can be notice/debug/log/info/warning or something else to return as text. Default notice. + in_cancel_duration interval DEFAULT null, -- Attempt to cancel the current statement of longest running transaction if running longer than this, starting with blockers. Default don't do anything. + in_terminate_duration interval DEFAULT null, -- Attempt to terminate the longest running transaction if running longer than this, starting with blockers. Default don't do anything. + in_cancel_all boolean DEFAULT false) -- When false will only attempt to cancel/terminate one at a time, when true will mercilessly cancel/terminate all. Default false. +RETURNS SETOF text AS +$BODY$ +DECLARE + v_xact record; + v_blk_xact record; + v_line text; + v_cx_attempts integer := 0; +BEGIN + FOR v_xact IN ( + SELECT procpid AS pid, + usename, + COALESCE(application_name,'[unknown]') as app, + COALESCE(client_addr || ':' || client_port, '[local]') AS client, + current_query, + current_timestamp-xact_start AS runtime_int, + extract(EPOCH FROM current_timestamp-xact_start) AS runtime, + waiting + FROM pg_stat_activity + WHERE current_timestamp-xact_start > in_duration + AND datname = current_database() + ORDER BY waiting ASC, runtime DESC + ) LOOP + v_line := format(E'long_xact pid: %s duration: %s ms user: %s application: %s client: %s statement: %s', + v_xact.pid, v_xact.runtime, v_xact.usename, v_xact.app, v_xact.client, v_xact.current_query); + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE LOG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + + -- Really better to enable log_lock_waits than use this + IF in_report_blockers AND v_xact.waiting THEN + SELECT INTO v_blk_xact extract(EPOCH FROM current_timestamp-ba.xact_start) AS blk_runtime, + b.pid AS blk_pid, + ba.usename AS blk_usename, + COALESCE(ba.application_name,'[unknown]') as blk_app, + COALESCE(ba.client_addr || ':' || ba.client_port, '[local]') AS blk_client, + ba.current_query AS blk_query, + COALESCE(t.schemaname || '.' || t.relname || ' (' || b.mode || ')', bt.locks, '[unknown]') AS blk_relation, + CASE b.locktype WHEN 'transactionid' THEN 'transaction id ' || b.transactionid ELSE b.locktype END AS blk_type + FROM pg_catalog.pg_locks l + LEFT JOIN pg_catalog.pg_locks b ON b.granted + AND ((b.locktype = 'transactionid' AND b.transactionid = l.transactionid) OR (b.locktype = 'relation' AND b.relation = l.relation)) + AND b.pid != l.pid + LEFT JOIN pg_catalog.pg_stat_activity ba ON ba.procpid = b.pid + LEFT JOIN pg_catalog.pg_stat_user_tables t ON l.relation = t.relid + LEFT JOIN (SELECT pid, 'any (' || string_agg(schemaname || '.' || relname || ' (' || mode || ')',', ') || ')' AS locks + FROM pg_locks JOIN pg_stat_user_tables ON relation = relid + WHERE mode ~ 'ExclusiveLock' + GROUP BY pid) bt ON bt.pid = b.pid + WHERE l.pid = v_xact.pid AND NOT l.granted LIMIT 1; + + IF FOUND THEN + v_line := format(E'long_xact waiter pid: %s blocker detail is; pid: %s duration: %s ms relation: %s lock type: %s user: %s application: %s client: %s statement: %s', + v_xact.pid, v_blk_xact.blk_pid, v_blk_xact.blk_runtime, v_blk_xact.blk_relation, v_blk_xact.blk_type, v_blk_xact.blk_usename, v_blk_xact.blk_app, + v_blk_xact.blk_client, v_blk_xact.blk_query); + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE DEBUG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + END IF; + END IF; + + -- In 9.3+ it's really better to use lock_timeout than use this + IF (v_cx_attempts = 0 OR in_cancel_all) AND in_cancel_duration IS NOT NULL AND v_xact.runtime_int > in_cancel_duration THEN + v_cx_attempts := v_cx_attempts+1; + IF v_xact.current_query <> ' in transaction' AND pg_cancel_backend(v_xact.pid) THEN + v_line := format(E'long_xact cancelled backend with pid: %s', v_xact.pid); + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE DEBUG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + ELSIF in_terminate_duration IS NOT NULL AND v_xact.runtime_int > in_terminate_duration THEN + IF pg_terminate_backend(v_xact.pid) THEN + v_line := format('long_xact terminated backend with pid: %s', v_xact.pid); + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE DEBUG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + ELSE + v_line := format('long_xact unable to terminate backend with pid: %s', v_xact.pid); + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE DEBUG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + + END IF; + ELSE + v_line := format('long_xact unable to cancel backend with pid: %s', v_xact.pid); + + CASE lower(in_raise_type) + WHEN 'notice' THEN RAISE NOTICE '%', v_line; + WHEN 'debug' THEN RAISE DEBUG '%', v_line; + WHEN 'log' THEN RAISE LOG '%', v_line; + WHEN 'info' THEN RAISE INFO '%', v_line; + WHEN 'warning' THEN RAISE WARNING '%', v_line; + ELSE RETURN NEXT v_line; + END CASE; + END IF; + END IF; + END LOOP; + + RETURN; +END; +$BODY$ +LANGUAGE plpgsql VOLATILE;