From: glyn Date: Fri, 13 Feb 2015 13:16:49 +0000 (+0000) Subject: Initial commit X-Git-Url: https://git.8kb.co.uk/?p=postgresql%2Fpg_update_chunks;a=commitdiff_plain;h=9385c1c3da9d914ab1c86102be39ca7704dbe33c Initial commit --- 9385c1c3da9d914ab1c86102be39ca7704dbe33c diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fafff2e --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.DS_Store +Thumbs.db diff --git a/EXAMPLE.md b/EXAMPLE.md new file mode 100755 index 0000000..51ebd11 --- /dev/null +++ b/EXAMPLE.md @@ -0,0 +1,110 @@ +Test example +============ +In reality we'd only be bothered with this on large tables + +Create a test table: + +```sql +TEST=# CREATE TABLE test(a integer PRIMARY KEY, b text); +CREATE TABLE +Time: 3.793 ms + +TEST=# INSERT INTO test SELECT i, 'SOME TEXT ' || i FROM generate_series(1,1000000) i; +INSERT 0 1000000 +Time: 11630.081 ms + +TEST=# \dt+ test + List of relations + Schema | Name | Type | Owner | Size | Description +--------+------+-------+-----------+-------+------------- + public | test | table | pgcontrol | 50 MB | +(1 row) +``` + +Update half the table, and attempt a vacuum: +```sql +TEST=# UPDATE test SET b = 'SOMETHING ELSE ' || a WHERE a > 500000; +UPDATE 500001 +Time: 7729.968 ms + +TEST=# VACUUM TEST; +VACUUM +Time: 528.894 ms + +TEST=# \dt+ test + List of relations + Schema | Name | Type | Owner | Size | Description +--------+------+-------+-----------+-------+------------- + public | test | table | pgcontrol | 78 MB | +(1 row) +``` + +We've got some bloat we can't remove easily. If we use the function to split +the update into chunks with intermediate vacuums we should have less bloat. 
+ +Reset our test table: + +```sql +TEST=# TRUNCATE TABLE test; + +TEST=# INSERT INTO test SELECT i, 'SOME TEXT ' || i FROM generate_series(1,1000000) i; +INSERT 0 1000000 +Time: 11541.353 ms + +TEST=# \dt+ test + List of relations + Schema | Name | Type | Owner | Size | Description +--------+------+-------+-----------+-------+------------- + public | test | table | pgcontrol | 50 MB | +(1 row) +``` + +For vacuum to do its work best we should not have a long running transaction +in the database containing the table, so best to switch to another database +before running our function: + +```sql +TEST=# \c postgres +You are now connected to database "postgres" + +postgres=# SELECT pg_chunk_update('public', 'test', 'b', E'\'SOMETHING ELSE \' || a', 'a > 500000', 200, 'dbname=TEST user=glyn'); +NOTICE: 2012-02-25 12:36:18.155259: Starting update; chunks = 200 chunk_size = 2500 +NOTICE: Equiv' full SQL: + UPDATE public.test SET (b)=('SOMETHING ELSE ' || a) WHERE a > 500000; +NOTICE: 2012-02-25 12:36:18.156007+00: Updating chunk 1 of 200 (elapsed time: 0.001303s est' remainig time: ?s) +NOTICE: 2012-02-25 12:36:20.159222+00: Chunk 1 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:36:20.159369+00: Updating chunk 2 of 200 (elapsed time: 2.004122s est' remainig time: 398.372727s) +NOTICE: 2012-02-25 12:36:22.061396+00: Chunk 2 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:36:22.061473+00: Updating chunk 3 of 200 (elapsed time: 3.906225s est' remainig time: 376.601544s) +NOTICE: 2012-02-25 12:36:23.968849+00: Chunk 3 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:36:23.968938+00: Updating chunk 4 of 200 (elapsed time: 5.813693s est' remainig time: 375.756421s) + + + +NOTICE: 2012-02-25 12:41:36.605952+00: Updating chunk 198 of 200 (elapsed time: 318.450704s est' remainig time: 5.009034s) +NOTICE: 2012-02-25 12:41:38.275669+00: Chunk 198 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:41:38.275774+00: Updating chunk 199 of 200
(elapsed time: 320.120526s est' remainig time: 3.339504s) +NOTICE: 2012-02-25 12:41:39.947854+00: Chunk 199 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:41:39.947938+00: Updating chunk 200 of 200 (elapsed time: 321.79269s est' remainig time: 1.672085s) +NOTICE: 2012-02-25 12:41:41.62008+00: Chunk 200 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:41:41.620295+00: Final update pass (elapsed time: 323.465235s est' remainig time: 0s) +NOTICE: 2012-02-25 12:41:42.520855+00: Chunk 201 status : UPDATE 0 OK / VACUUM OK + pg_chunk_update +----------------- + +(1 row) + +Time: 324902.578 ms + +postgres=# \c test + +TEST=# \dt+ test + List of relations + Schema | Name | Type | Owner | Size | Description +--------+------+-------+--------+-------+------------- + public | test | table | admins | 54 MB | +(1 row) + +``` + +It took a lot longer but we'd have had less blocking, and we've got less bloat; vacuum did its work. diff --git a/README.md b/README.md new file mode 100755 index 0000000..00b6cdc --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +pg_chunk_update +=============== + +Quick and dirty pl/pgsql function to split full / large table updates into +chunks and perform intermediate vacuums. + +Connects via dblink to perform individual transactions. + +Arguments +--------- + +pg_chunk_update( + in_nspname -- Schema containing the table + in_relname -- Table name + in_fields -- The field names to update comma separated * + in_values -- The values for the corresponding field names * + in_where_clause -- Any where clause for the update * + in_chunks -- Break the update into this many chunks + in_conninfo -- database conninfo to pass to dblink +) + +* Arguments for in_fields, in_values and in_where_clause are plain text and not +sanitized in any way, so ensure tight permissions to prevent SQL injection.
+ +Usage +----- +For vacuum to do its work best we should not have a long running transaction +in the database containing the table, so best to switch to another database: + +```sql +TEST=# \c postgres +You are now connected to database "postgres" +postgres=# SELECT pg_chunk_update('public', 'test', 'b', E'\'SOMETHING ELSE \' || a', 'a > 500000', 200, 'dbname=TEST user=glyn'); +``` + diff --git a/full_table_vs_chunks.txt b/full_table_vs_chunks.txt new file mode 100755 index 0000000..27e077e --- /dev/null +++ b/full_table_vs_chunks.txt @@ -0,0 +1,71 @@ +Before: + +SEE=# \dt+ emailad + List of relations + Schema | Name | Type | Owner | Size | Description +-----------+---------+-------+-----------+--------+------------- + customers | emailad | table | pgcontrol | 436 MB | +(1 row) + +SEE=# \di+ emailad* + List of relations + Schema | Name | Type | Owner | Table | Size | Description +-----------+--------------------------------+-------+-----------+---------+--------+------------- + customers | emailad_index00 | index | pgcontrol | emailad | 72 MB | + customers | emailad_index01 | index | pgcontrol | emailad | 144 MB | + customers | emailad_index02 | index | pgcontrol | emailad | 129 MB | + customers | emailad_index03 | index | pgcontrol | emailad | 101 MB | + customers | emailad_index04 | index | pgcontrol | emailad | 101 MB | + customers | emailad_index05 | index | pgcontrol | emailad | 101 MB | + customers | emailad_index06 | index | pgcontrol | emailad | 129 MB | + customers | emailad_index_auto_past_booker | index | pgcontrol | emailad | 144 MB | + customers | emailad_lower_address | index | pgcontrol | emailad | 144 MB | +(9 rows) + +Full update: + +SEE=# \dt+ emailad; + List of relations + Schema | Name | Type | Owner | Size | Description +-----------+---------+-------+-----------+--------+------------- + customers | emailad | table | pgcontrol | 871 MB | +(1 row) + +SEE=# \di+ emailad* + List of relations + Schema | Name | Type | Owner | Table | Size | 
Description +-----------+--------------------------------+-------+-----------+---------+--------+------------- + customers | emailad_index00 | index | pgcontrol | emailad | 167 MB | + customers | emailad_index01 | index | pgcontrol | emailad | 289 MB | + customers | emailad_index02 | index | pgcontrol | emailad | 328 MB | + customers | emailad_index03 | index | pgcontrol | emailad | 249 MB | + customers | emailad_index04 | index | pgcontrol | emailad | 235 MB | + customers | emailad_index05 | index | pgcontrol | emailad | 233 MB | + customers | emailad_index06 | index | pgcontrol | emailad | 308 MB | + customers | emailad_index_auto_past_booker | index | pgcontrol | emailad | 289 MB | + customers | emailad_lower_address | index | pgcontrol | emailad | 290 MB | +(9 rows) + +1000 updates with vacuums (will have to be reindexed but otherwise much better): + +SEE=# \dt+ emailad* + List of relations + Schema | Name | Type | Owner | Size | Description +-----------+---------+-------+-----------+--------+------------- + customers | emailad | table | pgcontrol | 441 MB | +(1 row) + +SEE=# \di+ emailad* + List of relations + Schema | Name | Type | Owner | Table | Size | Description +-----------+--------------------------------+-------+-----------+---------+--------+------------- + customers | emailad_index00 | index | pgcontrol | emailad | 162 MB | + customers | emailad_index01 | index | pgcontrol | emailad | 144 MB | + customers | emailad_index02 | index | pgcontrol | emailad | 267 MB | + customers | emailad_index03 | index | pgcontrol | emailad | 225 MB | + customers | emailad_index04 | index | pgcontrol | emailad | 227 MB | + customers | emailad_index05 | index | pgcontrol | emailad | 226 MB | + customers | emailad_index06 | index | pgcontrol | emailad | 250 MB | + customers | emailad_index_auto_past_booker | index | pgcontrol | emailad | 144 MB | + customers | emailad_lower_address | index | pgcontrol | emailad | 144 MB | +(9 rows) diff --git a/pg_chunk_update.plpgsql 
b/pg_chunk_update.plpgsql new file mode 100755 index 0000000..0aeb80c --- /dev/null +++ b/pg_chunk_update.plpgsql
--
-- Glyn Astill 25/02/2012
-- Break a large / full-table UPDATE into chunks, each executed in its own
-- transaction over dblink, with a VACUUM after every chunk so the dead
-- tuples left by the previous chunk can be reused and table bloat is
-- kept down.
--
-- Arguments:
--   in_nspname      - Schema containing the table
--   in_relname      - Table name
--   in_fields       - Column names to update, comma separated (raw SQL)
--   in_values       - Value expressions for those columns, comma separated (raw SQL)
--   in_where_clause - Optional where clause for the update (raw SQL, may be NULL)
--   in_chunks       - Number of chunks to break the update into
--   in_conninfo     - dblink conninfo string for the database holding the table
--
-- WARNING: in_fields / in_values / in_where_clause are interpolated directly
-- into the generated SQL and are NOT sanitized; restrict execute permission
-- to trusted callers to prevent SQL injection.
--

SET search_path=public;

DROP FUNCTION IF EXISTS pg_chunk_update(in_nspname text, in_relname text, in_fields text, in_values text, in_where_clause text, in_chunks integer, in_conninfo text);
CREATE OR REPLACE FUNCTION pg_chunk_update(in_nspname text, in_relname text, in_fields text, in_values text, in_where_clause text, in_chunks integer, in_conninfo text)
RETURNS void AS
$BODY$
DECLARE
    v_sql text;                  -- SQL statement currently being built
    v_update_status text;        -- command tag (+ any error message) of the chunk UPDATE
    v_vacuum_status text;        -- command tag (+ any error message) of the chunk VACUUM
    v_chunk_size bigint;         -- rows to update per chunk
    v_debug boolean := FALSE;    -- set TRUE to also RAISE the generated SQL
    v_start_timestamp timestamp; -- start of the whole run (for elapsed time)
    v_last_timestamp timestamp;  -- start of the previous chunk (for the ETA)
BEGIN
    -- Guard against division by zero / a nonsense chunk count in the
    -- generated SQL below.
    IF (in_chunks < 1) THEN
        RAISE EXCEPTION 'in_chunks must be >= 1, got %', in_chunks;
    END IF;

    -- Reuse the named dblink connection: close a leftover idle one, but
    -- never tread on a connection that is mid-statement.
    IF EXISTS (SELECT * FROM dblink_get_connections() WHERE ARRAY['chunk_updates'] <@ dblink_get_connections) THEN
        IF (dblink_is_busy('chunk_updates') = 0) THEN
            RAISE NOTICE '%: Closing non-busy dblink connection', clock_timestamp();
            PERFORM dblink_disconnect('chunk_updates');
        ELSE
            RAISE EXCEPTION '%: Found busy dblink connection', clock_timestamp();
        END IF;
    END IF;

    PERFORM dblink_connect('chunk_updates', in_conninfo);

    -- Size each chunk from the number of rows still needing the update.
    -- ceil() guarantees a chunk size of at least 1 whenever matching rows
    -- exist; plain integer division (count/in_chunks) rounds a row count
    -- smaller than in_chunks down to 0 and the update would be silently
    -- skipped.  A leading WHERE in in_where_clause is rewritten to AND so
    -- callers may supply the clause either way.
    v_sql := 'SELECT ceil(count(*)::numeric/' || in_chunks || ')::bigint FROM ' || quote_ident(in_nspname) || '.' || quote_ident(in_relname) ||
             ' WHERE ((' || in_fields || ') IS NULL OR (' || in_fields || ') <> (' || in_values || '))' ||
             COALESCE(' AND ' || regexp_replace(in_where_clause, E'WHERE\\y','AND', 'i'),'') || ';';
    SELECT INTO v_chunk_size cs FROM dblink('chunk_updates', v_sql) AS result (cs bigint);

    IF (v_chunk_size > 0) THEN

        v_start_timestamp := clock_timestamp();
        RAISE NOTICE '%: Starting update; chunks = % chunk_size = %', v_start_timestamp, in_chunks, v_chunk_size;
        RAISE NOTICE E'Equiv\' full SQL: \n\tUPDATE %.% SET (%)=(%) WHERE %;',
            quote_ident(in_nspname), quote_ident(in_relname), in_fields, in_values,
            COALESCE(regexp_replace(in_where_clause, E'WHERE\\y','AND', 'i'),'');

        -- in_chunks chunked passes plus one final full pass to mop up rows
        -- missed through rounding or concurrent changes.
        FOR chunk IN 1..(in_chunks+1) LOOP

            v_sql := 'UPDATE ' || quote_ident(in_nspname) || '.' || quote_ident(in_relname) || ' a ' ||
                     ' SET (' || in_fields || ')=(' || in_values || ')';

            IF (chunk <= in_chunks) THEN
                -- ETA = time the previous chunk took * chunks left; NULL
                -- (no previous chunk yet) shows as '?'.
                RAISE NOTICE E'%: Updating chunk % of % (elapsed time: %s est\' remaining time: %s)', clock_timestamp(), chunk, in_chunks,
                    extract('epoch' from (clock_timestamp()-v_start_timestamp)),
                    COALESCE((extract('epoch' from (clock_timestamp()-v_last_timestamp))*(in_chunks+1-chunk))::text,'?');

                -- Chunked pass: restrict the update to the first chunk_size
                -- rows (by ctid) that still need updating.  Already-updated
                -- rows no longer match the predicate, so no OFFSET is needed.
                v_sql := v_sql || ' FROM (SELECT ctid FROM ' || quote_ident(in_nspname) || '.' || quote_ident(in_relname) ||
                         ' WHERE ((' || in_fields || ') IS NULL OR (' || in_fields || ') <> (' || in_values || '))' ||
                         COALESCE(' AND ' || regexp_replace(in_where_clause, E'WHERE\\y','AND', 'i'),'') ||
                         ' ORDER BY ctid LIMIT ' || v_chunk_size || ' ) b WHERE a.ctid = b.ctid AND';
            ELSE
                RAISE NOTICE E'%: Final update pass (elapsed time: %s est\' remaining time: %s)', clock_timestamp(),
                    extract('epoch' from (clock_timestamp()-v_start_timestamp)),
                    COALESCE((extract('epoch' from (clock_timestamp()-v_last_timestamp))*(in_chunks+1-chunk))::text,'?');
                v_sql := v_sql || ' WHERE';
            END IF;

            v_sql := v_sql || ' ((' || in_fields || ') IS NULL OR (' || in_fields || ') <> (' || in_values || '))' ||
                     COALESCE(' AND ' || regexp_replace(in_where_clause, E'WHERE\\y','AND', 'i'),'') || ';';

            IF (v_debug) THEN
                RAISE NOTICE '%: SQL : %', clock_timestamp(), v_sql;
            END IF;

            v_last_timestamp := clock_timestamp();
            v_update_status := dblink_exec('chunk_updates', v_sql);
            v_update_status := v_update_status || ' ' || dblink_error_message('chunk_updates');

            -- Vacuum between chunks so the space freed by this chunk's dead
            -- tuples can be reused by the next one.
            v_sql := 'VACUUM ' || quote_ident(in_nspname) || '.' || quote_ident(in_relname) || ';';

            IF (v_debug) THEN
                RAISE NOTICE '%: SQL : %', clock_timestamp(), v_sql;
            END IF;

            v_vacuum_status := dblink_exec('chunk_updates', v_sql);
            v_vacuum_status := v_vacuum_status || ' ' || dblink_error_message('chunk_updates');

            RAISE NOTICE '%: Chunk % status : % / %', clock_timestamp(), chunk, v_update_status, v_vacuum_status;
        END LOOP;
    ELSE
        RAISE NOTICE '%: Nothing to do!', clock_timestamp();
    END IF;

    PERFORM dblink_disconnect('chunk_updates');
EXCEPTION WHEN OTHERS THEN
    -- Don't leak the named dblink connection on failure; ignore errors from
    -- the disconnect itself (e.g. we failed before connecting).
    BEGIN
        PERFORM dblink_disconnect('chunk_updates');
    EXCEPTION WHEN OTHERS THEN
        NULL;
    END;
    RAISE;
END;
$BODY$
LANGUAGE plpgsql VOLATILE;
diff --git a/pg_chunk_update.sh b/pg_chunk_update.sh new file mode 100755 index 0000000..50cf856 --- /dev/null +++ b/pg_chunk_update.sh
#!/bin/bash

# Example script to split full table updates into chunks with a vacuum inbetween
# to try and avoid table bloat.
#
# Sometimes it's better to create a new table and swap them, but when that's not possible
# due to other complications something like this may do.

## CREATE TABLE test(ax integer PRIMARY KEY, bx text);
## INSERT INTO test SELECT i, 'SOME TEXT ' || i FROM generate_series(1,1000000) i;

psql_prefix="/usr/local/pgsql/bin"
user="pgcontrol"
database="SEE"
schema="public"
relation="test"
fields="bx"                      # Comma separated
values="'SOMETHING ELSE ' || ax" # Comma separated
pk="ax"
skip_pk_vals="0"                 # Comma separated pk values to leave untouched
chunks=50

# Size each chunk from the number of rows still needing the update.
sql="SELECT count(*)/$chunks FROM $schema.$relation WHERE $pk NOT IN ($skip_pk_vals) AND (($fields) IS NULL OR ($fields) <> ($values));"
chunk_size=$("$psql_prefix/psql" -U "$user" -d "$database" -tAc "$sql")

echo "CHUNK SIZE $chunk_size"

for i in $(seq 1 $((chunks-1))); do
    echo "CHUNK $i)"

    # Update the first chunk_size rows (by ctid) that still need the change;
    # rows updated by earlier chunks no longer match the predicate, so no
    # OFFSET is required (the original computed an unused offset variable).
    sql="UPDATE $schema.$relation a SET ($fields) = ($values) FROM (SELECT ctid FROM $schema.$relation WHERE $pk NOT IN ($skip_pk_vals) AND (($fields) IS NULL OR ($fields) <> ($values)) ORDER BY ctid LIMIT $chunk_size) b WHERE a.ctid = b.ctid AND (($fields) IS NULL OR ($fields) <> ($values));"
    echo "$sql"
    result=$("$psql_prefix/psql" -U "$user" -d "$database" -c "$sql")
    echo "$result"

    # Vacuum between chunks so freed space can be reused by the next chunk.
    sql="VACUUM $schema.$relation;"
    echo "$sql"
    result=$("$psql_prefix/psql" -U "$user" -d "$database" -c "$sql")
    echo "$result"
done

# Final pass mops up anything the chunked passes missed.  It must use the
# same predicate as the chunked passes: the original omitted both the
# "($fields) IS NULL" test (NULL columns were never updated) and the
# pk skip list (supposedly-skipped rows were updated after all).
sql="UPDATE $schema.$relation SET ($fields) = ($values) WHERE $pk NOT IN ($skip_pk_vals) AND (($fields) IS NULL OR ($fields) <> ($values));"
echo "$sql"
result=$("$psql_prefix/psql" -U "$user" -d "$database" -c "$sql")
echo "$result"

sql="VACUUM ANALYZE $schema.$relation;"
echo "$sql"
result=$("$psql_prefix/psql" -U "$user" -d "$database" -c "$sql")
echo "$result"