]> git.8kb.co.uk Git - postgresql/encoding_woes/commitdiff
Initial commit
authorglyn <glyn@8kb.co.uk>
Tue, 11 Mar 2014 17:43:26 +0000 (17:43 +0000)
committerglyn <glyn@8kb.co.uk>
Tue, 11 Mar 2014 17:43:26 +0000 (17:43 +0000)
.gitignore [new file with mode: 0644]
codepage_850.cs [new file with mode: 0644]
codepage_850.plperlu [new file with mode: 0644]
codepage_850.plpythonu [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..ff37810
--- /dev/null
@@ -0,0 +1,2 @@
+.DS_Store
+Thumb.db
diff --git a/codepage_850.cs b/codepage_850.cs
new file mode 100644 (file)
index 0000000..a5cf91d
--- /dev/null
@@ -0,0 +1,87 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Npgsql;
+using System.Text.RegularExpressions;
+
+namespace EncTest
+{
+    class Program
+    {
+        private static Encoding _eCp850 = Encoding.GetEncoding(850);
+        private static Encoding _eUnicode = Encoding.UTF8;
+        private static Encoding _eLatin1 = Encoding.GetEncoding("ISO-8859-1");
+
+        public static string encode_cp850(string sText)
+        {
+            string sReturn;
+            byte[] bSource;
+            byte[] bTarget;
+
+            bSource = _eUnicode.GetBytes(sText);
+            bTarget = Encoding.Convert(_eUnicode, _eCp850, bSource);
+            sReturn = _eLatin1.GetString(bTarget);
+
+            return sReturn;
+        }
+
+        public static string decode_cp850(byte[] sTextAsBytea)
+        {
+            string sReturn;
+            byte[] bSource = sTextAsBytea;
+            byte[] bTarget;
+
+            bTarget = Encoding.Convert(_eCp850, _eUnicode, bSource);
+            sReturn = _eUnicode.GetString(bTarget);
+
+            return sReturn;
+        }
+
+        static void Main(string[] args)
+        {
+
+            NpgsqlConnection conn;
+            NpgsqlTransaction tran;
+            NpgsqlCommand command;
+            String sIn;
+
+            conn = new NpgsqlConnection("Server=localhost;Port=5432;User Id=myuser;Password=mypassword;Database=mydatabase;Timeout=600;Pooling=false;ApplicationName=enctest");
+            conn.Open();
+            tran = conn.BeginTransaction();
+
+            /////// Writing cp850
+            NpgsqlParameter add1 = new NpgsqlParameter("add1", NpgsqlTypes.NpgsqlDbType.Text);
+            command = new NpgsqlCommand("INSERT INTO myfile (add1) VALUES (:add1);", conn, tran);
+            add1.Value = encode_cp850("Schwanthaler Straße 75a");
+
+            command.Parameters.Add(add1);
+            command.ExecuteNonQuery();
+
+            /////// Reading cp850
+            //Read the data as bytea so as not to automatically convert into UTF-16
+            command = new NpgsqlCommand("select add1::bytea from myfile order by id desc limit 2;", conn, tran);
+            NpgsqlDataReader dr = command.ExecuteReader();
+            while (dr.Read())
+            {
+                for (int i = 0; i < dr.FieldCount; i++)
+                {
+                    // We now have bytea rather than string, but we can decode it easily enough
+                    byte[] bSource = (byte[])dr[i];
+                    String reEncodedString = decode_cp850(bSource);
+                    Console.Write("Value: {0} \n", reEncodedString);
+                }
+                Console.WriteLine();
+            }
+
+            tran.Commit();
+            conn.Close();
+
+            conn.Open();
+
+            String keypress = Console.ReadLine().ToUpper();
+
+        }        
+    }
+}
diff --git a/codepage_850.plperlu b/codepage_850.plperlu
new file mode 100644 (file)
index 0000000..b931375
--- /dev/null
@@ -0,0 +1,124 @@
+--
+-- Glyn Astill 09/02/2014
+-- Plperl untrusted functions to do conversion from DOS cp850 native through Latin1 to UTF8 and vice versa
+--
+
+-- The following functions require the user to know what their client_encoiding is and call the appropriate function
+
+DROP FUNCTION IF EXISTS cp850_to_latin1(text);
+CREATE OR REPLACE FUNCTION cp850_to_latin1(text)
+RETURNS text AS
+$BODY$
+       use Encode;
+        return encode( 'utf8', decode('cp850', $_[0] ));
+$BODY$
+LANGUAGE 'plperlu' IMMUTABLE;
+
+COMMENT ON FUNCTION public.cp850_to_latin1(text) IS 'Converts text from cp850 to current latin1';
+
+--
+
+DROP FUNCTION IF EXISTS cp850_to_utf8(text);
+CREATE OR REPLACE FUNCTION cp850_to_utf8(text)
+RETURNS text AS
+$BODY$
+       use Encode;
+        return encode( 'iso-8859-1', decode('cp850', $_[0] ));
+$BODY$
+LANGUAGE 'plperlu' IMMUTABLE;
+
+COMMENT ON FUNCTION public.cp850_to_utf8(text) IS 'Converts text from cp850 to current utf8';
+
+--
+
+DROP FUNCTION IF EXISTS latin1_to_cp850(text);
+CREATE OR REPLACE FUNCTION latin1_to_cp850(text)
+RETURNS text AS
+$BODY$
+       use Encode;
+        return encode( 'cp850', decode('utf8', $_[0] ));
+$BODY$
+LANGUAGE 'plperlu' IMMUTABLE;
+
+COMMENT ON FUNCTION public.latin1_to_cp850(text) IS 'Converts text from current latin1 to cp850';
+
+--
+
+DROP FUNCTION IF EXISTS utf8_to_cp850(text);
+CREATE OR REPLACE FUNCTION utf8_to_cp850(text)
+RETURNS text AS
+$BODY$
+       use Encode;
+        return encode( 'cp850', decode('iso-8859-1', $_[0] ));
+$BODY$
+LANGUAGE 'plperlu' IMMUTABLE;
+
+COMMENT ON FUNCTION public.utf8_to_cp850(text) IS 'Converts text from current utf8 to cp850';
+
+-- The following functions do not require the user to know what their client_encoiding is
+
+DROP FUNCTION IF EXISTS public.to_cp850(text);
+CREATE OR REPLACE FUNCTION public.to_cp850(text)
+RETURNS text AS
+$BODY$
+       use Encode;
+       my $string;
+       my $rv = spi_exec_query('SELECT pg_client_encoding();', 1);
+       my $encoding = $rv->{rows}[0]->{pg_client_encoding};
+       
+       eval {
+               if ($encoding eq 'UTF8') {
+                       $string = encode( 'cp850', decode('iso-8859-1', $_[0] ));
+               }
+               elsif (($encoding eq 'LATIN1') || ($encoding eq 'SQL_ASCII')){
+                       $string = encode( 'cp850', decode('utf8', $_[0] ));
+               }
+               else {
+                       elog(ERROR, "to_cp850 currently does not support client_encoding $encoding");
+               }
+       };
+       if ($@) {
+               elog(WARNING, "Re-encoding failed, stripping non alphanumerics: $@");
+               $string = $_[0] =~ s/[\x01-\x7f]/?/g;
+       }
+        return $string;
+$BODY$
+LANGUAGE 'plperlu' IMMUTABLE;
+
+COMMENT ON FUNCTION public.to_cp850(text) IS 'Converts text from current client_encoding to cp850';
+
+--
+
+DROP FUNCTION IF EXISTS public.from_cp850(text);
+CREATE OR REPLACE FUNCTION public.from_cp850(text)
+RETURNS text AS
+$BODY$
+       use Encode;
+       my $string;
+       my $rv = spi_exec_query('SELECT pg_client_encoding();', 1);
+       my $encoding = $rv->{rows}[0]->{pg_client_encoding};
+       
+       eval {
+               if ($encoding eq 'UTF8') {
+                       $string = encode( 'iso-8859-1', decode('cp850', $_[0] ));
+               }
+               elsif (($encoding eq 'LATIN1') || ($encoding eq 'SQL_ASCII')){
+                       $string = encode( 'utf8', decode('cp850', $_[0] ));
+               }
+               else {
+                       elog(ERROR, "from_cp850 currently does not support client_encoding $encoding");
+               }
+       };
+       if ($@) {
+               elog(WARNING, "Re-encoding failed: $@");
+               $string = $_[0];
+       }
+        return $string;
+
+        
+$BODY$
+LANGUAGE 'plperlu' IMMUTABLE;
+
+COMMENT ON FUNCTION public.from_cp850(text) IS 'Converts text from cp850 to current client_encoding';
+
+
diff --git a/codepage_850.plpythonu b/codepage_850.plpythonu
new file mode 100644 (file)
index 0000000..59818ac
--- /dev/null
@@ -0,0 +1,54 @@
+--
+-- Glyn Astill 09/02/2014
+-- Plpython untrusted functions to do conversion from DOS cp850 native through Latin1 to UTF8 and vice versa
+--
+
+CREATE OR REPLACE FUNCTION cp850_to_utf8(in_text text) RETURNS text AS 
+$BODY$
+    rv = plpy.execute("SHOW client_encoding", 1)
+    v_encoding = rv[0]["client_encoding"]
+    if v_encoding.upper() != 'UTF8':
+        raise plpy.Error('Sorry client_encoding should be UTF8')
+    else:
+        return unicode(in_text, 'cp850').encode('iso8859_1')
+$BODY$ 
+LANGUAGE plpythonu;
+
+--
+
+CREATE OR REPLACE FUNCTION utf8_to_cp850(in_text text) RETURNS text AS 
+$BODY$
+    rv = plpy.execute("SHOW client_encoding", 1)
+    v_encoding = rv[0]["client_encoding"]
+    if v_encoding.upper() != 'UTF8':
+        raise plpy.Error('Sorry client_encoding should be UTF8')
+    else:
+        return unicode(in_text, 'iso8859_1').encode('cp850')
+$BODY$ 
+LANGUAGE plpythonu;
+
+--
+
+CREATE OR REPLACE FUNCTION cp850_to_latin1(in_text text) RETURNS text AS 
+$BODY$
+    rv = plpy.execute("SHOW client_encoding", 1)
+    v_encoding = rv[0]["client_encoding"]
+    if v_encoding.upper() != 'LATIN1':
+        raise plpy.Error('Sorry client_encoding should be LATIN1')
+    else:
+        return unicode(in_text, 'cp850').encode('utf-8')
+$BODY$ 
+LANGUAGE plpythonu;
+
+--
+
+CREATE OR REPLACE FUNCTION latin1_to_cp850(in_text text) RETURNS text AS 
+$BODY$
+    rv = plpy.execute("SHOW client_encoding", 1)
+    v_encoding = rv[0]["client_encoding"]
+    if v_encoding.upper() != 'LATIN1':
+        raise plpy.Error('Sorry client_encoding should be LATIN1')
+    else:
+        return unicode(in_text, 'utf-8').encode('cp850')
+$BODY$ 
+LANGUAGE plpythonu;