From: glyn Date: Fri, 13 Feb 2015 13:16:49 +0000 (+0000) Subject: Initial commit X-Git-Url: https://git.8kb.co.uk/?p=postgresql%2Fpg_update_chunks;a=commitdiff_plain;h=9385c1c3da9d914ab1c86102be39ca7704dbe33c Initial commit --- 9385c1c3da9d914ab1c86102be39ca7704dbe33c diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fafff2e --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.DS_Store +Thumbs.db diff --git a/EXAMPLE.md b/EXAMPLE.md new file mode 100755 index 0000000..51ebd11 --- /dev/null +++ b/EXAMPLE.md @@ -0,0 +1,110 @@ +Test example +============ +In reality we'd only be bothered with this on large tables + +Create a test table: + +```sql +TEST=# CREATE TABLE test(a integer PRIMARY KEY, b text); +CREATE TABLE +Time: 3.793 ms + +TEST=# INSERT INTO test SELECT i, 'SOME TEXT ' || i FROM generate_series(1,1000000) i; +INSERT 0 1000000 +Time: 11630.081 ms + +TEST=# \dt+ test + List of relations + Schema | Name | Type | Owner | Size | Description +--------+------+-------+-----------+-------+------------- + public | test | table | pgcontrol | 50 MB | +(1 row) +``` + +Update half the table, and attempt a vacuum: +```sql +TEST=# UPDATE test SET b = 'SOMETHING ELSE ' || a WHERE a > 500000; +UPDATE 500001 +Time: 7729.968 ms + +TEST=# VACUUM TEST; +VACUUM +Time: 528.894 ms + +TEST=# \dt+ test + List of relations + Schema | Name | Type | Owner | Size | Description +--------+------+-------+-----------+-------+------------- + public | test | table | pgcontrol | 78 MB | +(1 row) +``` + +We've got some bloat we can't remove easily. If we use the function to split +the update into chunks with intermediate vacuums we should have less bloat. 
+ +Reset our test table: + +```sql +TEST=# TRUNCATE TABLE test; + +TEST=# INSERT INTO test SELECT i, 'SOME TEXT ' || i FROM generate_series(1,1000000) i; +INSERT 0 1000000 +Time: 11541.353 ms + +TEST=# \dt+ test + List of relations + Schema | Name | Type | Owner | Size | Description +--------+------+-------+-----------+-------+------------- + public | test | table | pgcontrol | 50 MB | +(1 row) +``` + +For vacuum to do its work best we should not have a long running transaction +in the database containing the table, so best to switch to another database +before running our function: + +```sql +TEST=# \c postgres +You are now connected to database "postgres" + +postgres=# SELECT pg_chunk_update('public', 'test', 'b', E'\'SOMETHING ELSE \' || a', 'a > 500000', 200, 'dbname=TEST user=glyn'); +NOTICE: 2012-02-25 12:36:18.155259: Starting update; chunks = 200 chunk_size = 2500 +NOTICE: Equiv' full SQL: + UPDATE public.test SET (b)=('SOMETHING ELSE ' || a) WHERE a > 500000; +NOTICE: 2012-02-25 12:36:18.156007+00: Updating chunk 1 of 200 (elapsed time: 0.001303s est' remainig time: ?s) +NOTICE: 2012-02-25 12:36:20.159222+00: Chunk 1 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:36:20.159369+00: Updating chunk 2 of 200 (elapsed time: 2.004122s est' remainig time: 398.372727s) +NOTICE: 2012-02-25 12:36:22.061396+00: Chunk 2 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:36:22.061473+00: Updating chunk 3 of 200 (elapsed time: 3.906225s est' remainig time: 376.601544s) +NOTICE: 2012-02-25 12:36:23.968849+00: Chunk 3 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:36:23.968938+00: Updating chunk 4 of 200 (elapsed time: 5.813693s est' remainig time: 375.756421s) + + + +NOTICE: 2012-02-25 12:41:36.605952+00: Updating chunk 198 of 200 (elapsed time: 318.450704s est' remainig time: 5.009034s) +NOTICE: 2012-02-25 12:41:38.275669+00: Chunk 198 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:41:38.275774+00: Updating chunk 199 of 200
(elapsed time: 320.120526s est' remainig time: 3.339504s) +NOTICE: 2012-02-25 12:41:39.947854+00: Chunk 199 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:41:39.947938+00: Updating chunk 200 of 200 (elapsed time: 321.79269s est' remainig time: 1.672085s) +NOTICE: 2012-02-25 12:41:41.62008+00: Chunk 200 status : UPDATE 2500 OK / VACUUM OK +NOTICE: 2012-02-25 12:41:41.620295+00: Final update pass (elapsed time: 323.465235s est' remainig time: 0s) +NOTICE: 2012-02-25 12:41:42.520855+00: Chunk 201 status : UPDATE 0 OK / VACUUM OK + pg_chunk_update +----------------- + +(1 row) + +Time: 324902.578 ms + +postgres=# \c test + +TEST=# \dt+ test + List of relations + Schema | Name | Type | Owner | Size | Description +--------+------+-------+--------+-------+------------- + public | test | table | admins | 54 MB | +(1 row) + +``` + +It took a lot longer but we'd have had less blocking, and we've got less bloat; vacuum did its work. diff --git a/README.md b/README.md new file mode 100755 index 0000000..00b6cdc --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +pg_chunk_update +=============== + +Quick and dirty pl/pgsql function to split full / large table updates into +chunks and perform intermediate vacuums. + +Connects via dblink to perform individual transactions. + +Arguments +--------- + +pg_chunk_update( + in_nspname -- Schema containing the table + in_relname -- Table name + in_fields -- The field names to update comma separated * + in_values -- The values for the corresponding field names * + in_where_clause -- Any where clause for the update * + in_chunks -- Break the update into this many chunks + in_conninfo -- database conninfo to pass to dblink +) + +* Arguments for in_fields, in_values and in_where_clause are plain text and not +sanitized in any way, so ensure tight permissions to prevent SQL injection.
+ +Usage +----- +For vacuum to do its work best we should not have a long running transaction +in the database containing the table, so best to switch to another database: + +```sql +TEST=# \c postgres +You are now connected to database "postgres" +postgres=# SELECT pg_chunk_update('public', 'test', 'b', E'\'SOMETHING ELSE \' || a', 'a > 500000', 200, 'dbname=TEST user=glyn'); +``` + diff --git a/full_table_vs_chunks.txt b/full_table_vs_chunks.txt new file mode 100755 index 0000000..27e077e --- /dev/null +++ b/full_table_vs_chunks.txt @@ -0,0 +1,71 @@ +Before: + +SEE=# \dt+ emailad + List of relations + Schema | Name | Type | Owner | Size | Description +-----------+---------+-------+-----------+--------+------------- + customers | emailad | table | pgcontrol | 436 MB | +(1 row) + +SEE=# \di+ emailad* + List of relations + Schema | Name | Type | Owner | Table | Size | Description +-----------+--------------------------------+-------+-----------+---------+--------+------------- + customers | emailad_index00 | index | pgcontrol | emailad | 72 MB | + customers | emailad_index01 | index | pgcontrol | emailad | 144 MB | + customers | emailad_index02 | index | pgcontrol | emailad | 129 MB | + customers | emailad_index03 | index | pgcontrol | emailad | 101 MB | + customers | emailad_index04 | index | pgcontrol | emailad | 101 MB | + customers | emailad_index05 | index | pgcontrol | emailad | 101 MB | + customers | emailad_index06 | index | pgcontrol | emailad | 129 MB | + customers | emailad_index_auto_past_booker | index | pgcontrol | emailad | 144 MB | + customers | emailad_lower_address | index | pgcontrol | emailad | 144 MB | +(9 rows) + +Full update: + +SEE=# \dt+ emailad; + List of relations + Schema | Name | Type | Owner | Size | Description +-----------+---------+-------+-----------+--------+------------- + customers | emailad | table | pgcontrol | 871 MB | +(1 row) + +SEE=# \di+ emailad* + List of relations + Schema | Name | Type | Owner | Table | Size | 
Description +-----------+--------------------------------+-------+-----------+---------+--------+------------- + customers | emailad_index00 | index | pgcontrol | emailad | 167 MB | + customers | emailad_index01 | index | pgcontrol | emailad | 289 MB | + customers | emailad_index02 | index | pgcontrol | emailad | 328 MB | + customers | emailad_index03 | index | pgcontrol | emailad | 249 MB | + customers | emailad_index04 | index | pgcontrol | emailad | 235 MB | + customers | emailad_index05 | index | pgcontrol | emailad | 233 MB | + customers | emailad_index06 | index | pgcontrol | emailad | 308 MB | + customers | emailad_index_auto_past_booker | index | pgcontrol | emailad | 289 MB | + customers | emailad_lower_address | index | pgcontrol | emailad | 290 MB | +(9 rows) + +1000 updates with vacuums (will have to be reindexed but otherwise much better): + +SEE=# \dt+ emailad* + List of relations + Schema | Name | Type | Owner | Size | Description +-----------+---------+-------+-----------+--------+------------- + customers | emailad | table | pgcontrol | 441 MB | +(1 row) + +SEE=# \di+ emailad* + List of relations + Schema | Name | Type | Owner | Table | Size | Description +-----------+--------------------------------+-------+-----------+---------+--------+------------- + customers | emailad_index00 | index | pgcontrol | emailad | 162 MB | + customers | emailad_index01 | index | pgcontrol | emailad | 144 MB | + customers | emailad_index02 | index | pgcontrol | emailad | 267 MB | + customers | emailad_index03 | index | pgcontrol | emailad | 225 MB | + customers | emailad_index04 | index | pgcontrol | emailad | 227 MB | + customers | emailad_index05 | index | pgcontrol | emailad | 226 MB | + customers | emailad_index06 | index | pgcontrol | emailad | 250 MB | + customers | emailad_index_auto_past_booker | index | pgcontrol | emailad | 144 MB | + customers | emailad_lower_address | index | pgcontrol | emailad | 144 MB | +(9 rows) diff --git a/pg_chunk_update.plpgsql 
b/pg_chunk_update.plpgsql new file mode 100755 index 0000000..0aeb80c --- /dev/null +++ b/pg_chunk_update.plpgsql
--
-- Glyn Astill 25/02/2012
-- Break a large / full-table UPDATE into chunks, each executed in its own
-- transaction over dblink, with a VACUUM after every chunk so the dead
-- tuples left by the previous chunk can be reused and table bloat is
-- kept down.
--
-- Arguments:
--   in_nspname      - Schema containing the table
--   in_relname      - Table name
--   in_fields       - Column names to update, comma separated (raw SQL)
--   in_values       - Value expressions for those columns, comma separated (raw SQL)
--   in_where_clause - Optional where clause for the update (raw SQL, may be NULL)
--   in_chunks       - Number of chunks to break the update into
--   in_conninfo     - dblink conninfo string for the database holding the table
--
-- WARNING: in_fields / in_values / in_where_clause are interpolated directly
-- into the generated SQL and are NOT sanitized; restrict execute permission
-- to trusted callers to prevent SQL injection.
--

SET search_path=public;

DROP FUNCTION IF EXISTS pg_chunk_update(in_nspname text, in_relname text, in_fields text, in_values text, in_where_clause text, in_chunks integer, in_conninfo text);
CREATE OR REPLACE FUNCTION pg_chunk_update(in_nspname text, in_relname text, in_fields text, in_values text, in_where_clause text, in_chunks integer, in_conninfo text)
RETURNS void AS
$BODY$
DECLARE
    v_sql text;                  -- SQL statement currently being built
    v_update_status text;        -- command tag (+ any error message) of the chunk UPDATE
    v_vacuum_status text;        -- command tag (+ any error message) of the chunk VACUUM
    v_chunk_size bigint;         -- rows to update per chunk
    v_debug boolean := FALSE;    -- set TRUE to also RAISE the generated SQL
    v_start_timestamp timestamp; -- start of the whole run (for elapsed time)
    v_last_timestamp timestamp;  -- start of the previous chunk (for the ETA)
BEGIN
    -- Guard against division by zero / a nonsense chunk count in the
    -- generated SQL below.
    IF (in_chunks < 1) THEN
        RAISE EXCEPTION 'in_chunks must be >= 1, got %', in_chunks;
    END IF;

    -- Reuse the named dblink connection: close a leftover idle one, but
    -- never tread on a connection that is mid-statement.
    IF EXISTS (SELECT * FROM dblink_get_connections() WHERE ARRAY['chunk_updates'] <@ dblink_get_connections) THEN
        IF (dblink_is_busy('chunk_updates') = 0) THEN
            RAISE NOTICE '%: Closing non-busy dblink connection', clock_timestamp();
            PERFORM dblink_disconnect('chunk_updates');
        ELSE
            RAISE EXCEPTION '%: Found busy dblink connection', clock_timestamp();
        END IF;
    END IF;

    PERFORM dblink_connect('chunk_updates', in_conninfo);

    -- Size each chunk from the number of rows still needing the update.
    -- ceil() guarantees a chunk size of at least 1 whenever matching rows
    -- exist; plain integer division (count/in_chunks) rounds a row count
    -- smaller than in_chunks down to 0 and the update would be silently
    -- skipped.  A leading WHERE in in_where_clause is rewritten to AND so
    -- callers may supply the clause either way.
    v_sql := 'SELECT ceil(count(*)::numeric/' || in_chunks || ')::bigint FROM ' || quote_ident(in_nspname) || '.' || quote_ident(in_relname) ||
             ' WHERE ((' || in_fields || ') IS NULL OR (' || in_fields || ') <> (' || in_values || '))' ||
             COALESCE(' AND ' || regexp_replace(in_where_clause, E'WHERE\\y','AND', 'i'),'') || ';';
    SELECT INTO v_chunk_size cs FROM dblink('chunk_updates', v_sql) AS result (cs bigint);

    IF (v_chunk_size > 0) THEN

        v_start_timestamp := clock_timestamp();
        RAISE NOTICE '%: Starting update; chunks = % chunk_size = %', v_start_timestamp, in_chunks, v_chunk_size;
        RAISE NOTICE E'Equiv\' full SQL: \n\tUPDATE %.% SET (%)=(%) WHERE %;',
            quote_ident(in_nspname), quote_ident(in_relname), in_fields, in_values,
            COALESCE(regexp_replace(in_where_clause, E'WHERE\\y','AND', 'i'),'');

        -- in_chunks chunked passes plus one final full pass to mop up rows
        -- missed through rounding or concurrent changes.
        FOR chunk IN 1..(in_chunks+1) LOOP

            v_sql := 'UPDATE ' || quote_ident(in_nspname) || '.' || quote_ident(in_relname) || ' a ' ||
                     ' SET (' || in_fields || ')=(' || in_values || ')';

            IF (chunk <= in_chunks) THEN
                -- ETA = time the previous chunk took * chunks left; NULL
                -- (no previous chunk yet) shows as '?'.
                RAISE NOTICE E'%: Updating chunk % of % (elapsed time: %s est\' remaining time: %s)', clock_timestamp(), chunk, in_chunks,
                    extract('epoch' from (clock_timestamp()-v_start_timestamp)),
                    COALESCE((extract('epoch' from (clock_timestamp()-v_last_timestamp))*(in_chunks+1-chunk))::text,'?');

                -- Chunked pass: restrict the update to the first chunk_size
                -- rows (by ctid) that still need updating.  Already-updated
                -- rows no longer match the predicate, so no OFFSET is needed.
                v_sql := v_sql || ' FROM (SELECT ctid FROM ' || quote_ident(in_nspname) || '.' || quote_ident(in_relname) ||
                         ' WHERE ((' || in_fields || ') IS NULL OR (' || in_fields || ') <> (' || in_values || '))' ||
                         COALESCE(' AND ' || regexp_replace(in_where_clause, E'WHERE\\y','AND', 'i'),'') ||
                         ' ORDER BY ctid LIMIT ' || v_chunk_size || ' ) b WHERE a.ctid = b.ctid AND';
            ELSE
                RAISE NOTICE E'%: Final update pass (elapsed time: %s est\' remaining time: %s)', clock_timestamp(),
                    extract('epoch' from (clock_timestamp()-v_start_timestamp)),
                    COALESCE((extract('epoch' from (clock_timestamp()-v_last_timestamp))*(in_chunks+1-chunk))::text,'?');
                v_sql := v_sql || ' WHERE';
            END IF;

            v_sql := v_sql || ' ((' || in_fields || ') IS NULL OR (' || in_fields || ') <> (' || in_values || '))' ||
                     COALESCE(' AND ' || regexp_replace(in_where_clause, E'WHERE\\y','AND', 'i'),'') || ';';

            IF (v_debug) THEN
                RAISE NOTICE '%: SQL : %', clock_timestamp(), v_sql;
            END IF;

            v_last_timestamp := clock_timestamp();
            v_update_status := dblink_exec('chunk_updates', v_sql);
            v_update_status := v_update_status || ' ' || dblink_error_message('chunk_updates');

            -- Vacuum between chunks so the space freed by this chunk's dead
            -- tuples can be reused by the next one.
            v_sql := 'VACUUM ' || quote_ident(in_nspname) || '.' || quote_ident(in_relname) || ';';

            IF (v_debug) THEN
                RAISE NOTICE '%: SQL : %', clock_timestamp(), v_sql;
            END IF;

            v_vacuum_status := dblink_exec('chunk_updates', v_sql);
            v_vacuum_status := v_vacuum_status || ' ' || dblink_error_message('chunk_updates');

            RAISE NOTICE '%: Chunk % status : % / %', clock_timestamp(), chunk, v_update_status, v_vacuum_status;
        END LOOP;
    ELSE
        RAISE NOTICE '%: Nothing to do!', clock_timestamp();
    END IF;

    PERFORM dblink_disconnect('chunk_updates');
EXCEPTION WHEN OTHERS THEN
    -- Don't leak the named dblink connection on failure; ignore errors from
    -- the disconnect itself (e.g. we failed before connecting).
    BEGIN
        PERFORM dblink_disconnect('chunk_updates');
    EXCEPTION WHEN OTHERS THEN
        NULL;
    END;
    RAISE;
END;
$BODY$
LANGUAGE plpgsql VOLATILE;
diff --git a/pg_chunk_update.sh b/pg_chunk_update.sh new file mode 100755 index 0000000..50cf856 --- /dev/null +++ b/pg_chunk_update.sh
#!/bin/bash

# Example script to split full table updates into chunks with a vacuum inbetween
# to try and avoid table bloat.
#
# Sometimes it's better to create a new table and swap them, but when that's not possible
# due to other complications something like this may do.

## CREATE TABLE test(ax integer PRIMARY KEY, bx text);
## INSERT INTO test SELECT i, 'SOME TEXT ' || i FROM generate_series(1,1000000) i;

psql_prefix="/usr/local/pgsql/bin"
user="pgcontrol"
database="SEE"
schema="public"
relation="test"
fields="bx"                      # Comma separated
values="'SOMETHING ELSE ' || ax" # Comma separated
pk="ax"
skip_pk_vals="0"                 # Comma separated pk values to leave untouched
chunks=50

# Size each chunk from the number of rows still needing the update.
sql="SELECT count(*)/$chunks FROM $schema.$relation WHERE $pk NOT IN ($skip_pk_vals) AND (($fields) IS NULL OR ($fields) <> ($values));"
chunk_size=$("$psql_prefix/psql" -U "$user" -d "$database" -tAc "$sql")

echo "CHUNK SIZE $chunk_size"

for i in $(seq 1 $((chunks-1))); do
    echo "CHUNK $i)"

    # Update the first chunk_size rows (by ctid) that still need the change;
    # rows updated by earlier chunks no longer match the predicate, so no
    # OFFSET is required (the original computed an unused offset variable).
    sql="UPDATE $schema.$relation a SET ($fields) = ($values) FROM (SELECT ctid FROM $schema.$relation WHERE $pk NOT IN ($skip_pk_vals) AND (($fields) IS NULL OR ($fields) <> ($values)) ORDER BY ctid LIMIT $chunk_size) b WHERE a.ctid = b.ctid AND (($fields) IS NULL OR ($fields) <> ($values));"
    echo "$sql"
    result=$("$psql_prefix/psql" -U "$user" -d "$database" -c "$sql")
    echo "$result"

    # Vacuum between chunks so freed space can be reused by the next chunk.
    sql="VACUUM $schema.$relation;"
    echo "$sql"
    result=$("$psql_prefix/psql" -U "$user" -d "$database" -c "$sql")
    echo "$result"
done

# Final pass mops up anything the chunked passes missed.  It must use the
# same predicate as the chunked passes: the original omitted both the
# "($fields) IS NULL" test (NULL columns were never updated) and the
# pk skip list (supposedly-skipped rows were updated after all).
sql="UPDATE $schema.$relation SET ($fields) = ($values) WHERE $pk NOT IN ($skip_pk_vals) AND (($fields) IS NULL OR ($fields) <> ($values));"
echo "$sql"
result=$("$psql_prefix/psql" -U "$user" -d "$database" -c "$sql")
echo "$result"

sql="VACUUM ANALYZE $schema.$relation;"
echo "$sql"
result=$("$psql_prefix/psql" -U "$user" -d "$database" -c "$sql")
echo "$result"