From: glyn Date: Tue, 11 Mar 2014 17:43:26 +0000 (+0000) Subject: Initial commit X-Git-Url: https://git.8kb.co.uk/?p=postgresql%2Fencoding_woes;a=commitdiff_plain;h=bb4238ba88e7e2f7dcc22cda959132943c2eb0ed Initial commit --- bb4238ba88e7e2f7dcc22cda959132943c2eb0ed diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ff37810 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.DS_Store +Thumb.db diff --git a/codepage_850.cs b/codepage_850.cs new file mode 100644 index 0000000..a5cf91d --- /dev/null +++ b/codepage_850.cs @@ -0,0 +1,87 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Npgsql; +using System.Text.RegularExpressions; + +namespace EncTest +{ + class Program + { + private static Encoding _eCp850 = Encoding.GetEncoding(850); + private static Encoding _eUnicode = Encoding.UTF8; + private static Encoding _eLatin1 = Encoding.GetEncoding("ISO-8859-1"); + + public static string encode_cp850(string sText) + { + string sReturn; + byte[] bSource; + byte[] bTarget; + + bSource = _eUnicode.GetBytes(sText); + bTarget = Encoding.Convert(_eUnicode, _eCp850, bSource); + sReturn = _eLatin1.GetString(bTarget); + + return sReturn; + } + + public static string decode_cp850(byte[] sTextAsBytea) + { + string sReturn; + byte[] bSource = sTextAsBytea; + byte[] bTarget; + + bTarget = Encoding.Convert(_eCp850, _eUnicode, bSource); + sReturn = _eUnicode.GetString(bTarget); + + return sReturn; + } + + static void Main(string[] args) + { + + NpgsqlConnection conn; + NpgsqlTransaction tran; + NpgsqlCommand command; + String sIn; + + conn = new NpgsqlConnection("Server=localhost;Port=5432;User Id=myuser;Password=mypassword;Database=mydatabase;Timeout=600;Pooling=false;ApplicationName=enctest"); + conn.Open(); + tran = conn.BeginTransaction(); + + /////// Writing cp850 + NpgsqlParameter add1 = new NpgsqlParameter("add1", NpgsqlTypes.NpgsqlDbType.Text); + command = new NpgsqlCommand("INSERT INTO myfile (add1) VALUES (:add1);", conn, tran); + add1.Value = encode_cp850("Schwanthaler Straße 75a"); + + command.Parameters.Add(add1); + command.ExecuteNonQuery(); + + /////// Reading cp850 + //Read the data as bytea so as not to automatically convert into UTF-16 + command = new NpgsqlCommand("select add1::bytea from myfile order by id desc limit 2;", conn, tran); + NpgsqlDataReader dr = command.ExecuteReader(); + while (dr.Read()) + { + for (int i = 0; i < dr.FieldCount; i++) + { + // We now have bytea rather than string, but we can decode it easily enough + byte[] bSource = (byte[])dr[i]; + String reEncodedString = decode_cp850(bSource); + Console.Write("Value: {0} \n", reEncodedString); + } + Console.WriteLine(); + } + + tran.Commit(); + conn.Close(); + + conn.Open(); + + String keypress = Console.ReadLine().ToUpper(); + + } + } +} diff --git a/codepage_850.plperlu b/codepage_850.plperlu new file mode 100644 index 0000000..b931375 --- /dev/null +++ b/codepage_850.plperlu @@ -0,0 +1,124 @@ +-- +-- Glyn Astill 09/02/2014 +-- Plperl untrusted functions to do conversion from DOS cp850 native through Latin1 to UTF8 and vice versa +-- + +-- The following functions require the user to know what their client_encoiding is and call the appropriate function + +DROP FUNCTION IF EXISTS cp850_to_latin1(text); +CREATE OR REPLACE FUNCTION cp850_to_latin1(text) +RETURNS text AS +$BODY$ + use Encode; + return encode( 'utf8', decode('cp850', $_[0] )); +$BODY$ +LANGUAGE 'plperlu' IMMUTABLE; + +COMMENT ON FUNCTION public.cp850_to_latin1(text) IS 'Converts text from cp850 to current latin1'; + +-- + +DROP FUNCTION IF EXISTS cp850_to_utf8(text); +CREATE OR REPLACE FUNCTION cp850_to_utf8(text) +RETURNS text AS +$BODY$ + use Encode; + return encode( 'iso-8859-1', decode('cp850', $_[0] )); +$BODY$ +LANGUAGE 'plperlu' IMMUTABLE; + +COMMENT ON FUNCTION public.cp850_to_utf8(text) IS 'Converts text from cp850 to current utf8'; + +-- + +DROP FUNCTION IF EXISTS latin1_to_cp850(text); +CREATE OR REPLACE FUNCTION latin1_to_cp850(text) +RETURNS text AS +$BODY$ + use Encode; + return encode( 'cp850', decode('utf8', $_[0] )); +$BODY$ +LANGUAGE 'plperlu' IMMUTABLE; + +COMMENT ON FUNCTION public.latin1_to_cp850(text) IS 'Converts text from current latin1 to cp850'; + +-- + +DROP FUNCTION IF EXISTS utf8_to_cp850(text); +CREATE OR REPLACE FUNCTION utf8_to_cp850(text) +RETURNS text AS +$BODY$ + use Encode; + return encode( 'cp850', decode('iso-8859-1', $_[0] )); +$BODY$ +LANGUAGE 'plperlu' IMMUTABLE; + +COMMENT ON FUNCTION public.utf8_to_cp850(text) IS 'Converts text from current utf8 to cp850'; + +-- The following functions do not require the user to know what their client_encoiding is + +DROP FUNCTION IF EXISTS public.to_cp850(text); +CREATE OR REPLACE FUNCTION public.to_cp850(text) +RETURNS text AS +$BODY$ + use Encode; + my $string; + my $rv = spi_exec_query('SELECT pg_client_encoding();', 1); + my $encoding = $rv->{rows}[0]->{pg_client_encoding}; + + eval { + if ($encoding eq 'UTF8') { + $string = encode( 'cp850', decode('iso-8859-1', $_[0] )); + } + elsif (($encoding eq 'LATIN1') || ($encoding eq 'SQL_ASCII')){ + $string = encode( 'cp850', decode('utf8', $_[0] )); + } + else { + elog(ERROR, "to_cp850 currently does not support client_encoding $encoding"); + } + }; + if ($@) { + elog(WARNING, "Re-encoding failed, stripping non alphanumerics: $@"); + $string = $_[0] =~ s/[\x01-\x7f]/?/g; + } + return $string; +$BODY$ +LANGUAGE 'plperlu' IMMUTABLE; + +COMMENT ON FUNCTION public.to_cp850(text) IS 'Converts text from current client_encoding to cp850'; + +-- + +DROP FUNCTION IF EXISTS public.from_cp850(text); +CREATE OR REPLACE FUNCTION public.from_cp850(text) +RETURNS text AS +$BODY$ + use Encode; + my $string; + my $rv = spi_exec_query('SELECT pg_client_encoding();', 1); + my $encoding = $rv->{rows}[0]->{pg_client_encoding}; + + eval { + if ($encoding eq 'UTF8') { + $string = encode( 'iso-8859-1', decode('cp850', $_[0] )); + } + elsif (($encoding eq 'LATIN1') || ($encoding eq 'SQL_ASCII')){ + $string = encode( 'utf8', decode('cp850', $_[0] )); + } + else { + elog(ERROR, "from_cp850 currently does not support client_encoding $encoding"); + } + }; + if ($@) { + elog(WARNING, "Re-encoding failed: $@"); + $string = $_[0]; + } + return $string; + + +$BODY$ +LANGUAGE 'plperlu' IMMUTABLE; + +COMMENT ON FUNCTION public.from_cp850(text) IS 'Converts text from cp850 to current client_encoding'; + + diff --git a/codepage_850.plpythonu b/codepage_850.plpythonu new file mode 100644 index 0000000..59818ac --- /dev/null +++ b/codepage_850.plpythonu @@ -0,0 +1,54 @@ +-- +-- Glyn Astill 09/02/2014 +-- Plpython untrusted functions to do conversion from DOS cp850 native through Latin1 to UTF8 and vice versa +-- + +CREATE OR REPLACE FUNCTION cp850_to_utf8(in_text text) RETURNS text AS +$BODY$ + rv = plpy.execute("SHOW client_encoding", 1) + v_encoding = rv[0]["client_encoding"] + if v_encoding.upper() != 'UTF8': + raise plpy.Error('Sorry client_encoding should be UTF8') + else: + return unicode(in_text, 'cp850').encode('iso8859_1') +$BODY$ +LANGUAGE plpythonu; + +-- + +CREATE OR REPLACE FUNCTION utf8_to_cp850(in_text text) RETURNS text AS +$BODY$ + rv = plpy.execute("SHOW client_encoding", 1) + v_encoding = rv[0]["client_encoding"] + if v_encoding.upper() != 'UTF8': + raise plpy.Error('Sorry client_encoding should be UTF8') + else: + return unicode(in_text, 'iso8859_1').encode('cp850') +$BODY$ +LANGUAGE plpythonu; + +-- + +CREATE OR REPLACE FUNCTION cp850_to_latin1(in_text text) RETURNS text AS +$BODY$ + rv = plpy.execute("SHOW client_encoding", 1) + v_encoding = rv[0]["client_encoding"] + if v_encoding.upper() != 'LATIN1': + raise plpy.Error('Sorry client_encoding should be LATIN1') + else: + return unicode(in_text, 'cp850').encode('utf-8') +$BODY$ +LANGUAGE plpythonu; + +-- + +CREATE OR REPLACE FUNCTION latin1_to_cp850(in_text text) RETURNS text AS +$BODY$ + rv = plpy.execute("SHOW client_encoding", 1) + v_encoding = rv[0]["client_encoding"] + if v_encoding.upper() != 'LATIN1': + raise plpy.Error('Sorry client_encoding should be LATIN1') + else: + return unicode(in_text, 'utf-8').encode('cp850') +$BODY$ +LANGUAGE plpythonu;