diff -urN -X /usr/src/excl.diff Python-1.5.orig/Include/stringobject.h Python-1.5/Include/stringobject.h --- Python-1.5.orig/Include/stringobject.h Sat Jan 18 08:53:23 1997 +++ Python-1.5/Include/stringobject.h Wed Apr 8 07:41:36 1998 @@ -1,9 +1,11 @@ #ifndef Py_STRINGOBJECT_H #define Py_STRINGOBJECT_H + #ifdef __cplusplus extern "C" { #endif + /*********************************************************** Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam, The Netherlands. @@ -63,7 +65,7 @@ here (and rebuild everything!). */ #define CACHE_HASH #ifdef CACHE_HASH -#define INTERN_STRINGS + /*#define INTERN_STRINGS*/ #endif typedef struct { @@ -78,8 +80,10 @@ } PyStringObject; extern DL_IMPORT(PyTypeObject) PyString_Type; +extern DL_IMPORT(PyTypeObject) PyWideString_Type; -#define PyString_Check(op) ((op)->ob_type == &PyString_Type) +#define PyString_Check(op) ((op)->ob_type == &PyString_Type || \ + (op)->ob_type == &PyWideString_Type) extern PyObject *PyString_FromStringAndSize Py_PROTO((const char *, int)); extern PyObject *PyString_FromString Py_PROTO((const char *)); @@ -105,4 +109,7 @@ #ifdef __cplusplus } #endif + +#include "wstringobject.h" + #endif /* !Py_STRINGOBJECT_H */ diff -urN -X /usr/src/excl.diff Python-1.5.orig/Include/wstringobject.h Python-1.5/Include/wstringobject.h --- Python-1.5.orig/Include/wstringobject.h Thu Jan 1 01:00:00 1970 +++ Python-1.5/Include/wstringobject.h Sun Apr 19 09:22:08 1998 @@ -0,0 +1,110 @@ +#ifndef Py_WSTRINGOBJECT_H +#define Py_WSTRINGOBJECT_H + +#ifdef __cplusplus +extern "C" { +#endif + + /* For now we'll use 16-bit ints for wide characters */ +typedef unsigned short PyWideChar; + +typedef struct{ + PyObject_VAR_HEAD +#ifdef CACHE_HASH + long ob_shash; +#endif +#ifdef INTERN_STRINGS + PyObject *ob_sinterned; +#endif + PyWideChar string[1]; +} PyWideString; + +extern DL_IMPORT(PyTypeObject) PyWideString_Type; + +#define PyWideString_Check(v) ((v)->ob_type == &PyWideString_Type) + +extern PyObject 
*PyWideString_FromStringAndSize Py_PROTO((const char *, int)); +extern PyObject *PyWideString_FromString Py_PROTO((const char *)); +extern PyObject *PyWideString_FromWideCharArray Py_PROTO((PyWideChar *, int)); +extern int PyWideString_Size Py_PROTO((PyObject *)); +extern PyWideChar *PyWideString_AsString Py_PROTO((PyObject *)); +extern void PyWideString_Concat Py_PROTO((PyObject **, PyObject *)); +extern void PyWideString_ConcatAndDel Py_PROTO((PyObject **, PyObject *)); +extern int _PyWideString_Resize Py_PROTO((PyObject **, int)); +extern PyObject *PyWideString_Format Py_PROTO((PyObject *, PyObject *)); +extern PyObject *PyWideString_To8bitString Py_PROTO((PyWideString *op)); +extern PyObject *PyWideString_FromQString Py_PROTO((PyObject *op)); +extern PyObject *PyWideString_ToQString Py_PROTO((PyObject *op)); +extern PyObject *PyWideString_FromUtf8 Py_PROTO((PyObject *op)); +extern PyObject *PyWideString_ToUtf8 Py_PROTO((PyObject *op)); +extern PyObject *PyString_ToUtf8 Py_PROTO((PyObject *op)); +extern PyObject *PyWideString_Utf8TryLatin1 Py_PROTO((PyObject *op)); + +#ifdef INTERN_STRINGS +extern void PyWideString_InternInPlace Py_PROTO((PyObject **)); +extern PyObject *PyWideString_InternFromString Py_PROTO((const char *)); +#else +#define PyWideString_InternInPlace(p) +#define PyWideString_InternFromString(cp) PyWideString_FromString(cp) +#endif + +/* Macros, trading safety for speed: op is cast to PyWideString * without any type check; the character array member of PyWideString is named string */ +#define PyWideString_AS_STRING(op) (((PyWideString *)(op))->string) +#define PyWideString_GET_SIZE(op) (((PyWideString *)(op))->ob_size) + +#ifdef __cplusplus +} +#endif +#endif /* !Py_WSTRINGOBJECT_H */ +#ifndef Py_WSTRINGOBJECT_H +#define Py_WSTRINGOBJECT_H + +#ifdef __cplusplus +extern "C" { +#endif + + /* For now we'll use 16-bit ints for wide characters */ +typedef unsigned short PyWideChar; + +typedef struct{ + PyObject_VAR_HEAD +#ifdef CACHE_HASH + long ob_shash; +#endif +#ifdef INTERN_STRINGS + PyObject *ob_sinterned; +#endif + PyWideChar string[1]; +} PyWideString; + 
+extern DL_IMPORT(PyTypeObject) PyWideString_Type; + +#define PyWideString_Check(v) ((v)->ob_type == &PyWideString_Type) + +extern PyObject *PyWideString_FromStringAndSize Py_PROTO((const char *, int)); +extern PyObject *PyWideString_FromString Py_PROTO((const char *)); +extern PyObject *PyWideString_FromWideCharArray Py_PROTO((PyWideChar *, int)); +extern int PyWideString_Size Py_PROTO((PyObject *)); +extern PyWideChar *PyWideString_AsString Py_PROTO((PyObject *)); +extern void PyWideString_Concat Py_PROTO((PyObject **, PyObject *)); +extern void PyWideString_ConcatAndDel Py_PROTO((PyObject **, PyObject *)); +extern int _PyWideString_Resize Py_PROTO((PyObject **, int)); +extern PyObject *PyWideString_Format Py_PROTO((PyObject *, PyObject *)); +PyObject *PyWideString_To8bitString Py_PROTO((PyWideString *op)); + +#ifdef INTERN_STRINGS +extern void PyWideString_InternInPlace Py_PROTO((PyObject **)); +extern PyObject *PyWideString_InternFromString Py_PROTO((const char *)); +#else +#define PyWideString_InternInPlace(p) +#define PyWideString_InternFromString(cp) PyWideString_FromString(cp) +#endif + +/* Macros, trading safety for speed: op is cast to PyWideString * without any type check; the character array member of PyWideString is named string */ +#define PyWideString_AS_STRING(op) (((PyWideString *)(op))->string) +#define PyWideString_GET_SIZE(op) (((PyWideString *)(op))->ob_size) + +#ifdef __cplusplus +} +#endif +#endif /* !Py_WSTRINGOBJECT_H */ diff -urN -X /usr/src/excl.diff Python-1.5.orig/Lib/string.py Python-1.5/Lib/string.py --- Python-1.5.orig/Lib/string.py Mon Dec 29 20:55:49 1997 +++ Python-1.5/Lib/string.py Sun Apr 19 09:38:25 1998 @@ -349,14 +349,14 @@ sign = s[0] s = s[1:] if not s: - raise ValueError, 'non-float argument to string.atof' + raise ValueError, 'non-float argument to string.atof not s' while s[0] == '0' and len(s) > 1 and s[1] in digits: s = s[1:] - if re and not re.match('[0-9]*(\.[0-9]*)?([eE][-+]?[0-9]+)?$', s): - raise ValueError, 'non-float argument to string.atof' + if re and not re.match('[0-9]*(\.[0-9]*)?([eE][-+]?[0-9]+)?\s*$', s): + raise 
ValueError, 'non-float argument to string.atof re' try: return float(eval(sign + s, safe_env)) except SyntaxError: - raise ValueError, 'non-float argument to string.atof' + raise ValueError, 'non-float argument to string.atof synt' # Convert string to integer def atoi(str, base=10): @@ -374,7 +374,7 @@ # We only get here if strop doesn't define atoi() raise ValueError, "this string.atoi doesn't support base != 10" sign = '' - s = str + s = strip(str) if s and s[0] in '+-': sign = s[0] s = s[1:] @@ -403,7 +403,7 @@ # We only get here if strop doesn't define atol() raise ValueError, "this string.atol doesn't support base != 10" sign = '' - s = str + s = strip(str) if s and s[0] in '+-': sign = s[0] s = s[1:] diff -urN -X /usr/src/excl.diff Python-1.5.orig/Lib/test/test_wide.py Python-1.5/Lib/test/test_wide.py --- Python-1.5.orig/Lib/test/test_wide.py Thu Jan 1 01:00:00 1970 +++ Python-1.5/Lib/test/test_wide.py Wed Apr 8 07:41:36 1998 @@ -0,0 +1,188 @@ +# Below is Lib/test/test_strop.py, slightly modified + +from test_support import verbose +import string, sys + +def test2(name, input, output, args): + if verbose: + print 'string.%s%s =? %s... 
' % (name, (input,) + args, output), + f = getattr(string, name) + try: + value = apply(f, (input,) + args) + except: + value = sys.exc_type + if value != output: + if verbose: + print 'no' + print value == output, f, `input`, `output`, `value` + else: + if verbose: + print 'yes' + +def test(name, input, output, *args): + test2(name, input, output, args) + if type(input) == type(""): + test2(name, unicode(input), output, args) + +test('atoi', " 1 ", 1) +test('atoi', " 1x", ValueError) +test('atoi', " x1 ", ValueError) +test('atol', " 1 ", 1L) +test('atol', " 1x ", ValueError) +test('atol', " x1 ", ValueError) +test('atof', " 1 ", 1.0) +test('atof', " 1x ", ValueError) +test('atof', " x1 ", ValueError) + +test('capitalize', ' hello ', ' hello ') +test('capitalize', 'hello ', 'Hello ') +test('find', 'abcdefghiabc', 0, 'abc') +test('find', 'abcdefghiabc', 9, 'abc', 1) +test('find', 'abcdefghiabc', -1, 'def', 4) +test('rfind', 'abcdefghiabc', 9, 'abc') +test('lower', 'HeLLo', 'hello') +test('upper', 'HeLLo', 'HELLO') + +transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377' + +test('maketrans', 'abc', transtable, 'xyz') +test('maketrans', 'abc', ValueError, 'xyzq') + +test('split', 'this is the split function', + ['this', 'is', 'the', 'split', 'function']) +test('split', 
'a|b|c|d', ['a', 'b', 'c', 'd'], '|') +test('split', 'a|b|c|d', ['a', 'b', 'c|d'], '|', 2) +test('split', 'a b c d', ['a', 'b c d'], None, 1) +test('split', 'a b c d', ['a', 'b', 'c d'], None, 2) +test('split', 'a b c d', ['a', 'b', 'c', 'd'], None, 3) +test('split', 'a b c d', ['a', 'b', 'c', 'd'], None, 4) +test('split', 'a b c d', ['a', 'b', 'c', 'd'], None, 0) +test('split', 'a b c d', ['a', 'b', 'c d'], None, 2) + +# join now works with any sequence type +class Sequence: + def __init__(self): self.seq = 'wxyz' + def __len__(self): return len(self.seq) + def __getitem__(self, i): return self.seq[i] + +test('join', ['a', 'b', 'c', 'd'], 'a b c d') +test('join', ('a', 'b', 'c', 'd'), 'abcd', '') +test('join', Sequence(), 'w x y z') + +# try a few long ones +print string.join(['x' * 100] * 100, ':') +print string.join(('x' * 100,) * 100, ':') + +test('strip', ' hello ', 'hello') +test('lstrip', ' hello ', 'hello ') +test('rstrip', ' hello ', ' hello') + +test('swapcase', 'HeLLo cOmpUteRs', 'hEllO CoMPuTErS') +test('translate', 'xyzabcdef', 'xyzxyz', transtable, 'def') + +test('replace', 'one!two!three!', 'one@two!three!', '!', '@', 1) +test('replace', 'one!two!three!', 'one@two@three!', '!', '@', 2) +test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 3) +test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 4) +test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 0) +test('replace', 'one!two!three!', 'one@two@three@', '!', '@') +test('replace', 'one!two!three!', 'one!two!three!', 'x', '@') +test('replace', 'one!two!three!', 'one!two!three!', 'x', '@', 2) + +string.whitespace +string.lowercase +string.uppercase +# Below is Lib/test/test_strop.py, slightly modified + +from test_support import verbose +import string, sys + +def test2(name, input, output, args): + if verbose: + print 'string.%s%s =? %s... 
' % (name, (input,) + args, output), + f = getattr(string, name) + try: + value = apply(f, (input,) + args) + except: + value = sys.exc_type + if value != output: + if verbose: + print 'no' + print value == output, f, `input`, `output`, `value` + else: + if verbose: + print 'yes' + +def test(name, input, output, *args): + test2(name, input, output, args) + if type(input) == type(""): + test2(name, unicode(input), output, args) + +test('atoi', " 1 ", 1) +test('atoi', " 1x", ValueError) +test('atoi', " x1 ", ValueError) +test('atol', " 1 ", 1L) +test('atol', " 1x ", ValueError) +test('atol', " x1 ", ValueError) +test('atof', " 1 ", 1.0) +test('atof', " 1x ", ValueError) +test('atof', " x1 ", ValueError) + +test('capitalize', ' hello ', ' hello ') +test('capitalize', 'hello ', 'Hello ') +test('find', 'abcdefghiabc', 0, 'abc') +test('find', 'abcdefghiabc', 9, 'abc', 1) +test('find', 'abcdefghiabc', -1, 'def', 4) +test('rfind', 'abcdefghiabc', 9, 'abc') +test('lower', 'HeLLo', 'hello') +test('upper', 'HeLLo', 'HELLO') + +transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377' + +test('maketrans', 'abc', transtable, 'xyz') +test('maketrans', 'abc', ValueError, 'xyzq') + +test('split', 'this is the split function', + ['this', 'is', 'the', 'split', 'function']) +test('split', 
'a|b|c|d', ['a', 'b', 'c', 'd'], '|') +test('split', 'a|b|c|d', ['a', 'b', 'c|d'], '|', 2) +test('split', 'a b c d', ['a', 'b c d'], None, 1) +test('split', 'a b c d', ['a', 'b', 'c d'], None, 2) +test('split', 'a b c d', ['a', 'b', 'c', 'd'], None, 3) +test('split', 'a b c d', ['a', 'b', 'c', 'd'], None, 4) +test('split', 'a b c d', ['a', 'b', 'c', 'd'], None, 0) +test('split', 'a b c d', ['a', 'b', 'c d'], None, 2) + +# join now works with any sequence type +class Sequence: + def __init__(self): self.seq = 'wxyz' + def __len__(self): return len(self.seq) + def __getitem__(self, i): return self.seq[i] + +test('join', ['a', 'b', 'c', 'd'], 'a b c d') +test('join', ('a', 'b', 'c', 'd'), 'abcd', '') +test('join', Sequence(), 'w x y z') + +# try a few long ones +print string.join(['x' * 100] * 100, ':') +print string.join(('x' * 100,) * 100, ':') + +test('strip', ' hello ', 'hello') +test('lstrip', ' hello ', 'hello ') +test('rstrip', ' hello ', ' hello') + +test('swapcase', 'HeLLo cOmpUteRs', 'hEllO CoMPuTErS') +test('translate', 'xyzabcdef', 'xyzxyz', transtable, 'def') + +test('replace', 'one!two!three!', 'one@two!three!', '!', '@', 1) +test('replace', 'one!two!three!', 'one@two@three!', '!', '@', 2) +test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 3) +test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 4) +test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 0) +test('replace', 'one!two!three!', 'one@two@three@', '!', '@') +test('replace', 'one!two!three!', 'one!two!three!', 'x', '@') +test('replace', 'one!two!three!', 'one!two!three!', 'x', '@', 2) + +string.whitespace +string.lowercase +string.uppercase diff -urN -X /usr/src/excl.diff Python-1.5.orig/Modules/_tkinter.c Python-1.5/Modules/_tkinter.c --- Python-1.5.orig/Modules/_tkinter.c Fri Oct 10 18:39:19 1997 +++ Python-1.5/Modules/_tkinter.c Sat Apr 18 17:49:28 1998 @@ -169,6 +169,40 @@ } } +static Tcl_Obj* +AsObj(value, tmp) + PyObject *value; + PyObject *tmp; +{ + Tcl_Obj 
*result; + if (PyWideString_Check(value)) { + PyObject *v = PyWideString_ToUtf8((PyWideString*)value); + result = AsObj(v,tmp); + Py_DECREF(v); + return result; + } + else if (PyString_Check(value)) + return Tcl_NewStringObj(PyString_AsString(value), + PyString_Size(value)); + else if (PyTuple_Check(value)) { + Tcl_Obj **argv = ckalloc(PyTuple_Size(value)*sizeof(Tcl_Obj*)); + int i; + if(!argv) + return 0; + for(i=0;i str\n\ +Convert str into the UTF-8 encoding."; + +static PyObject* +strop_utf8(self, args) + PyObject *self; /* Not used */ + PyObject *args; +{ + PyObject *o; + + if (!PyArg_ParseTuple(args, "O:utf8", &o)) + return NULL; + + if (PyWideString_Check(o)) + return PyWideString_ToUtf8(o); + if (PyString_Check(o)) + return PyString_ToUtf8(o); + PyErr_SetString(PyExc_ValueError,"utf8 conversion"); + return NULL; +} + +static char fromutf8__doc__[]= +"fromutf8(str) -> [w]str\n\ +Convert UTF-8 encoded str into LATIN-1 string if possible,\n\ +and into unicode string otherwise.\n"; + +static PyObject* +strop_fromutf8(self, args) + PyObject *self; /* Not used */ + PyObject *args; +{ + PyObject *o; + + if (!PyArg_ParseTuple(args, "O!:fromutf8", &PyString_Type, &o)) + return NULL; + return PyWideString_Utf8TryLatin1(o); +} /* List of functions defined in the module */ @@ -1116,6 +1156,7 @@ {"atol", strop_atol, 1, atol__doc__}, {"capitalize", strop_capitalize, 0, capitalize__doc__}, {"find", strop_find, 1, find__doc__}, + {"fromutf8", strop_fromutf8, 1, fromutf8__doc__}, {"join", strop_joinfields, 1, joinfields__doc__}, {"joinfields", strop_joinfields, 1, joinfields__doc__}, {"lstrip", strop_lstrip, 0, lstrip__doc__}, @@ -1130,6 +1171,7 @@ {"swapcase", strop_swapcase, 0, swapcase__doc__}, {"translate", strop_translate, 1, translate__doc__}, {"upper", strop_upper, 0, upper__doc__}, + {"utf8", strop_utf8, 1, utf8__doc__}, {NULL, NULL} /* sentinel */ }; diff -urN -X /usr/src/excl.diff Python-1.5.orig/Objects/Makefile.in Python-1.5/Objects/Makefile.in --- 
Python-1.5.orig/Objects/Makefile.in Sat Jul 19 21:39:29 1997 +++ Python-1.5/Objects/Makefile.in Wed Apr 8 07:41:36 1998 @@ -35,7 +35,8 @@ longobject.o dictobject.o methodobject.o \ moduleobject.o object.o rangeobject.o \ sliceobject.o stringobject.o \ - tupleobject.o typeobject.o + tupleobject.o typeobject.o \ + wstringobject.o SRCS= abstract.c \ classobject.c cobject.c complexobject.c \ @@ -44,7 +45,8 @@ longobject.c dictobject.c methodobject.c \ moduleobject.c object.c rangeobject.c \ sliceobject.c stringobject.c \ - tupleobject.c typeobject.c + tupleobject.c typeobject.c \ + wstringobject.c LIBRARY= ../libpython$(VERSION).a @@ -93,6 +95,7 @@ stringobject.o: stringobject.c tupleobject.o: tupleobject.c typeobject.o: typeobject.c +wstringobject.o: wstringobject.c # DO NOT DELETE THIS LINE -- mkdep uses it. # DO NOT PUT ANYTHING AFTER THIS LINE, IT WILL GO AWAY. diff -urN -X /usr/src/excl.diff Python-1.5.orig/Objects/fileobject.c Python-1.5/Objects/fileobject.c --- Python-1.5.orig/Objects/fileobject.c Fri Nov 7 20:20:34 1997 +++ Python-1.5/Objects/fileobject.c Wed Apr 8 07:41:36 1998 @@ -781,12 +781,42 @@ PyFileObject *f; PyObject *args; { + PyObject *obj; char *s; int n, n2; + PyBufferProcs *pb; + if (f->f_fp == NULL) return err_closed(); - if (!PyArg_Parse(args, "s#", &s, &n)) + if (!PyArg_Parse(args, "O", &obj)) return NULL; + pb = obj->ob_type->tp_as_buffer; + if (!PyString_Check(obj) && (pb == NULL || + pb->bf_getreadbuffer == NULL || + pb->bf_getsegcount == NULL) ) { + PyErr_SetString(PyExc_TypeError, "write() requires a string"); + return NULL; + } + if (PyWideString_Check(obj) && PyWideString_To8bitString(obj) == NULL) + return NULL; + if (PyString_Check(obj)) { + s = PyString_AsString(obj); + n = PyString_Size(obj); + } + else /* must use buffer interface */ + { + if ( (*pb->bf_getsegcount)(obj, NULL) != 1 ) { + PyErr_SetString(PyExc_TypeError, + "write() requires single-segment buffer" ); + return NULL; + } + if ( (n = + (*pb->bf_getreadbuffer)(obj, 0, 
&s)) < 0 ) { + /* I assume getreadbuffer will have set an exception? */ + return NULL; + } + } + f->f_softspace = 0; Py_BEGIN_ALLOW_THREADS errno = 0; diff -urN -X /usr/src/excl.diff Python-1.5.orig/Objects/object.c Python-1.5/Objects/object.c --- Python-1.5.orig/Objects/object.c Wed Nov 19 17:03:17 1997 +++ Python-1.5/Objects/object.c Wed Apr 8 07:41:36 1998 @@ -297,6 +297,27 @@ Py_DECREF(res); return (c < 0) ? -1 : (c > 0) ? 1 : 0; } + if ( (v->ob_type == &PyWideString_Type || + w->ob_type == &PyWideString_Type) && + PyString_Check(v) && PyString_Check(w) ) + { + int neg = 1; + if (PyWideString_Type.tp_compare == NULL) + return (v < w) ? -1 : 1; + + /* We must ensure that the first argument is a wide + string; the second argument can be either a regular or + wide string. If required, we swap the arguments and + negate the result */ + if (v->ob_type != &PyWideString_Type) + { + PyObject *tmp; + tmp = v; v = w; w = tmp; + neg = -1; /* operands were swapped, so the comparison result must change sign */ + } + return neg * (*PyWideString_Type.tp_compare)(v, w); + } + if ((tp = v->ob_type) != w->ob_type) { if (tp->tp_as_number != NULL && w->ob_type->tp_as_number != NULL) { diff -urN -X /usr/src/excl.diff Python-1.5.orig/Objects/stringobject.c Python-1.5/Objects/stringobject.c --- Python-1.5.orig/Objects/stringobject.c Mon Sep 8 20:30:11 1997 +++ Python-1.5/Objects/stringobject.c Wed Apr 8 07:41:36 1998 @@ -180,6 +180,9 @@ register PyObject *op; { if (!PyString_Check(op)) { + if (PyWideString_Check(op)) { + return sizeof(PyWideChar) * PyWideString_Size(op); + } PyErr_BadInternalCall(); return -1; } @@ -191,6 +194,9 @@ register PyObject *op; { if (!PyString_Check(op)) { + if (PyWideString_Check(op)) { + return (char *)PyWideString_AsString(op); + } PyErr_BadInternalCall(); return NULL; } @@ -249,7 +255,7 @@ register char *p; int quote; - /* figure out which quote to use; single is prefered */ + /* figure out which quote to use; single is preferred */ quote = '\''; if (strchr(op->ob_sval, '\'') && !strchr(op->ob_sval, '"')) quote = '"'; @@ -294,6 
+300,12 @@ PyErr_BadArgument(); return NULL; } + /* If either string is wide, call the wide string version */ + if (PyWideString_Check(a) || PyWideString_Check(bb)) { + return (PyWideString_Type.tp_as_sequence->sq_concat) + ( (PyObject *)a, (PyObject *)bb); + } + #define b ((PyStringObject *)bb) /* Optimize cases with empty left or right operand */ if (a->ob_size == 0) { @@ -479,7 +491,7 @@ const void **ptr; { PyErr_SetString(PyExc_TypeError, - "Cannot use string as modifyable buffer"); + "Cannot use string as modifiable buffer"); return -1; } diff -urN -X /usr/src/excl.diff Python-1.5.orig/Objects/wstringobject.c Python-1.5/Objects/wstringobject.c --- Python-1.5.orig/Objects/wstringobject.c Thu Jan 1 01:00:00 1970 +++ Python-1.5/Objects/wstringobject.c Sun Apr 19 09:28:57 1998 @@ -0,0 +1,1695 @@ +/*********************************************************** +Copyright (C) 1997 Martin von Löwis + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies. + +This software comes with no warranty. Use at your own risk. +******************************************************************/ + +/* Liberally hacked around by A.M. Kuchling (akuchling@acm.org) in an + effort to prototype Unicode support for Python */ + +#include +#include +#ifdef HAVE_WCHAR +#include +#else +/*#define unsigned int wchar_t; */ /* Should that be a typedef? */ +#endif +#include "Python.h" + +static char PyWStrop_Doc[]= +"Support for wide character strings\n" +"This module defines a new type wstring, which represents strings as\n" +"defined in ISO 10646 and Unicode. Some of it is in C, some in Python,\n" +"and some comes from the underlying C library, so the level of support\n" +"can differ among platforms.\n" +"Wide strings should behave like normal strings for most operations.\n" +"If they don't, there should be replacement operations. 
If there aren't\n" +"report it as a bug.\n" +"Many functions deal with the conversion from and to various character\n" +"sets and encodings. Those functions take an optional flags argument,\n" +"which is a combination of the following values:\n" +"SKIP_INVALID Illegal characters should be treated gracefully\n" +" rather than raising ConvertError\n" +"UTF7_QUOTE_OPTIONALS Characters not legal in some places of an RFC 822\n" +" message should be quoted in utf7()\n" +"UCS_SWITCH_BYTEORDER The UCS-2 and -4 strings should be considered\n" +" little endian.\n" +"UCS2_DO_UTF16 When processing UCS2 (e.g. inside UTF7), honor\n" +" cells reserved for UTF-16."; + + +#define SKIP_INVALID 1 +#define UTF7_QUOTE_OPTIONALS 2 +#define UCS_SWITCH_BYTEORDER 4 +#define UCS2_DO_UTF16 8 + +static PyObject* ConvertError; + +static PyObject* Encodings; +static PyObject* Decodings; +static PyObject* EncodingFunctions; +static PyObject* DecodingFunctions; +static PyObject* Aliases; + +static PyObject* PyWideString_Slice(PyWideString* a,int i,int j); + +/* allocates new WString. 
Possible optimization: share empty strings */ +static PyWideString* +PyWideString_New(int len) +{ + PyWideString *wstr; + /* this gives len+1 PyWideChar elements */ + wstr = (PyWideString*)malloc(sizeof(PyWideString)+len*sizeof(PyWideChar)); + if(!wstr) + return (PyWideString*)PyErr_NoMemory(); /* return value is 0, anyway */ + wstr->ob_type = &PyWideString_Type; + wstr->ob_size = len; + /* always zero-terminated */ + wstr->string[len]=0; +#ifdef CACHE_HASH + wstr->ob_shash = -1; +#endif +#ifdef INTERN_STRINGS + wstr->ob_sinterned = NULL; +#endif + _Py_NewReference(wstr); + return wstr; +} + +/* deletes the string */ +static void +PyWideString_Free(PyWideString *self) +{ + PyMem_DEL(self); +} + +/* AMK: Added missing functions from the string API */ + +PyObject *PyWideString_FromStringAndSize(str, size) + const char *str; int size; +{ + PyWideString *wstr; + int i; + + wstr = PyWideString_New(size); + if (wstr == NULL) {return NULL;} + + for(i=0; istring[i] = *(unsigned char*)(str+i); + } + + return (PyObject *)wstr; +} + +PyObject *PyWideString_FromString(str) + const char *str; +{ + return PyWideString_FromStringAndSize(str, strlen(str)); +} + +int PyWideString_Size(op) + register PyObject *op; +{ + if (!PyWideString_Check(op)) { + PyErr_BadInternalCall(); + return -1; + } + + return ((PyWideString *)op) -> ob_size; +} + +PyObject *PyWideString_FromWideCharArray(array, size) + PyWideChar *array; int size; +{ + PyWideString *wstr; + int i; + + wstr = PyWideString_New(size); + if (wstr == NULL) {return NULL;} + + for(i=0; istring[i] = array[i]; + } + + return (PyObject *)wstr; +} + +PyWideChar *PyWideString_AsString(op) + register PyObject *op; +{ + if (!PyWideString_Check(op)) { + PyErr_BadInternalCall(); + return NULL; + } + return ((PyWideString *)op) -> string; +} + +void PyWideString_Concat(pv, w) + register PyObject **pv; + register PyObject *w; +{ + PyErr_SetString(PyExc_SystemError, "unimplemented wide string function called (Concat)"); +} + +void 
PyWideString_ConcatAndDel(pv, w) + register PyObject **pv; + register PyObject *w; +{ + PyErr_SetString(PyExc_SystemError, "unimplemented wide string function called (ConcatAndDel)"); +} + +int _PyWideString_Resize(pv, newsize) + PyObject **pv; + int newsize; +{ + PyErr_SetString(PyExc_SystemError, "unimplemented wide string function called (resize)"); + return -1; +} + +PyObject *PyWideString_Format(format, args) + PyObject *format; + PyObject *args; +{ + PyErr_SetString(PyExc_SystemError, "unimplemented wide string function called (format)"); + return NULL; +} + +#if 0 /* This function does the wrong thing, and I'm confused about what + the right thing is. */ +static PyObject * +PyWideString_Print(op, fp, flags) + PyWideString *op; + FILE *fp; + int flags; +{ + int i; + char c; + int quote; + + /* XXX Ought to check for interrupts when writing long strings */ + if (flags & Py_PRINT_RAW) { + for(i=0; iob_size; i++) + { + c = op->string[i] / 256; + fwrite(&c, 1, 1, fp); + c = op->string[i] % 256; + fwrite(&c, 1, 1, fp); + } + return 0; + } + + /* figure out which quote to use; single is preferred */ + quote = '\''; + for(i=0; iob_size; i++) + { + if ( op->string[i] / 256 == '"' || op->string[i] % 256 == '"') + {quote = '\''; break;} + if ( op->string[i] / 256 == '\'' || op->string[i] % 256 == '\'') + {quote = '"';} + } + + fputc(quote, fp); + for (i = 0; i < op->ob_size; i++) { + c = op->string[i] / 256; + if (c == quote || c == '\\') + fprintf(fp, "\\%c", c); + else if (c < ' ' || c >= 0177) + fprintf(fp, "\\%03o", c & 0377); + else + fputc(c, fp); + + c = op->string[i] % 256; + if (c == quote || c == '\\') + fprintf(fp, "\\%c", c); + else if (c < ' ' || c >= 0177) + fprintf(fp, "\\%03o", c & 0377); + else + fputc(c, fp); + } + fputc(quote, fp); + return 0; +} +#endif + +/* This is a tricky function. It attempts to convert a wide string to + an 8-bit string, *in place*. 
+ Sets an exception and returns NULL if the conversion can't be done + because of characters >256. + Right now this just drops the high 8 bits of each character; should + it be modified to use different encodings? + */ + +PyObject *PyWideString_To8bitString(op) + PyWideString *op; +{ + int i; + unsigned char *c; + PyStringObject *as_str; + + for(i = 0; i < op->ob_size; i++) + { + if (255 < op->string[i]) + { + PyErr_SetString(PyExc_ValueError, "wide string cannot be converted to 8-bit string"); + if (Py_VerboseFlag) + { + fprintf(stderr, "Wide -> 8 bit conversion attempt failed\n"); + } + return NULL; + } + } + + if (Py_VerboseFlag) + { + fprintf(stderr, "Folding wide string to 8-bit string\n"); + } + + /* XXX Evil casting skulduggery here; how portable & safe is it? */ + /* Needs to be modified if you add new fields to the structures + representing either strings or wide strings */ + as_str = (PyStringObject *)op; + c = (unsigned char *) as_str->ob_sval; + for(i = 0; i < op->ob_size; i++) + {c[i] = (unsigned char) op->string[i];} + c[i] = '\0'; /* Null-terminate the string */ + op->ob_type = &PyString_Type; + + /* XXX the cached hash (say it three times fast) should still be OK, + but what about interned strings? */ + return (PyObject *)as_str; +} + +/* AMK: End of added API functions */ + +/* Converts a single wide character to a sequence of utf8 bytes. + Returns the number of bytes, or 0 on error. 
*/ +static int +to_utf8(PyWideChar c,unsigned char* buf) +{ + if(c<0x80){ + if(buf)buf[0]=c; + return 1; + } + if(c<0x800){ + if(buf){ + buf[0] = 0xc0 | (c>>6); + buf[1] = 0x80 | (c & 0x3f); + } + return 2; + } + if(c<0x10000){ + if(buf){ + buf[0] = 0xe0 | (c>>12); + buf[1] = 0x80 | ((c>>6) & 0x3f); + buf[2] = 0x80 | (c & 0x3f); + } + return 3; + } + if(c<0x200000){ + if(buf){ + buf[0] = 0xf0 | (c>>18); + buf[1] = 0x80 | ((c>>12) & 0x3f); + buf[2] = 0x80 | ((c>>6) & 0x3f); + buf[3] = 0x80 | (c & 0x3f); + } + return 4; + } + if(c<0x4000000){ + if(buf){ + buf[0] = 0xf8 | (c>>24); + buf[1] = 0x80 | ((c>>18) & 0x3f); + buf[2] = 0x80 | ((c>>12) & 0x3f); + buf[3] = 0x80 | ((c>>6) & 0x3f); + buf[4] = 0x80 | (c & 0x3f); + } + return 5; + } + if(c<0x80000000U){ /* 6-byte form carries 31 bits: 1 in the lead byte plus 5*6 in the trail bytes */ + if(buf){ + buf[0] = 0xfc | (c>>30); + buf[1] = 0x80 | ((c>>24) & 0x3f); + buf[2] = 0x80 | ((c>>18) & 0x3f); + buf[3] = 0x80 | ((c>>12) & 0x3f); + buf[4] = 0x80 | ((c>>6) & 0x3f); + buf[5] = 0x80 | (c & 0x3f); + } + return 6; + } + + /* not encodable */ + return 0; +} + +/* Decodes a sequence of utf8 bytes into a single wide character. + Returns the number of bytes consumed, or 0 on error */ +static int +from_utf8(const unsigned char* str,PyWideChar *c) +{ + int l=0,i; + if(*str<0x80){ + *c = *str; + return 1; + } + if(*str<0xc0) /* lead byte must not be 10xxxxxx */ + return 0; /* is c0 a possible lead byte? 
*/ + if(*str<0xe0){ /* 110xxxxx */ + *c = *str & 0x1f; + l=2; + }else if(*str<0xf0){ /* 1110xxxx */ + *c = *str & 0xf; + l=3; + }else if(*str<0xf8){ /* 11110xxx */ + *c = *str & 7; + l=4; + }else if(*str<0xfc){ /* 111110xx */ + *c = *str & 3; + l=5; + }else if(*str<0xfe){ /* 1111110x */ + *c = *str & 1; + l=6; + }else return 0; + + for(i=1;i='A' && c<='Z') + return 1; + if(c>='a' && c<='z') + return 1; + if(c>='0' && c<='9') + return 1; + if(strchr("'(),-./:?",c)) + return 1; + return 0; + case SET_O: + if(get_utf7_type(c,SET_D)) + return 1; + if(strchr("!\"#$%&*;<=>@[]^_`{|}",c)) + return 1; + return 0; + /* Set D plus characters that need not to be quoted according to Rule 3 */ + case SET_C: + if(get_utf7_type(c,SET_O)) + return 1; + if(strchr("\n\r\t ",c)) + return 1; + return 0; + case SET_B: + if(c>='A' && c<='Z') + return 1; + if(c>='a' && c<='z') + return 1; + if(c>='0' && c<='9') + return 1; + if(strchr("+/",c)) + return 1; + return 0; + } + return 0; +} + +/* Converts a natural in the range 0..63 to the + corresponding base64 character */ +static char +to_base64(int i) +{ + static char map[]= + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + if(i<64) + return map[i]; + return ' '; +} + +/* Converts a base64 character to the corresponding integer */ +static int +from_base64(char c) +{ + if(c>='A' && c<='Z') + return c-'A'; + if(c>='a' && c<='z') + return c-'a'+26; + if(c>='0' && c<='9') + return c-'0'+52; + if(c=='+') + return 62; + if(c=='/') + return 63; + return -1; +} + +/* Converts a UCS-2 string to UTF-7. + Returns the length of the UTF-7 string, or -1 on error. + If the destination buffer is 0, it only counts the length. 
*/ +static int +ucs2_to_utf7(char* utf7,unsigned char* ucs2,int len2,enum utf7_type optionals) +{ + int len7=0; + int carry=0,ccount=0; + int quoted=0; + int b64; +#define PUT7(c) do{if(utf7)*utf7++=c;len7++;}while(0) +#define FLUSH_CARRY \ + while(ccount>=6){ \ + b64=carry>>(ccount-6); \ + PUT7(to_base64(b64)); \ + ccount-=6; \ + carry &= (1<=16){ \ + PUT2((carry>>(ccount-8))&0xFF,(carry>>(ccount-16)) & 0xFF); \ + ccount-=16; \ + carry&=(1<8 + * The current test also detects +A- as ill-formed sequence + */ +#define UNQUOTE \ + if(!(flags & SKIP_INVALID) && ccount>6){ \ + PyErr_SetString(ConvertError, \ + "odd number of octets in quoted sequence"); \ + return -1; \ + } \ + carry=0;ccount=0;quoted=0; + while(len2){ + if(!quoted && *utf7!='+'){ + /* FIXME: check for invalid UTF-7 characters */ + PUT2(0,*utf7); + utf7++;len2--; + continue; + } + if(!quoted && *utf7=='+'){ + utf7++;len2--; + /* + must be followed either by - or a SET_B character */ + if((!(flags & SKIP_INVALID) && + (!len2 || (*utf7!='-' && !get_utf7_type(*utf7,SET_B))))){ + PyErr_SetString(ConvertError,"Invalid escape in UTF-7"); + return -1; + } + if(len2 && *utf7=='-'){ /* +- */ + PUT2(0,'+'); + utf7++;len2--; + }else + quoted=1; + continue; + } + /* handle base 64 string */ + if(!get_utf7_type(*utf7,SET_B)){ + /* not a base64 character, done */ + if(*utf7=='-'){ + /* final '-', skip it */ + utf7++;len2--; + } + FLUSH_CACHE; + /* check and clear remaining bits */ + UNQUOTE; + continue; + } + carry=(carry<<6)|from_base64(*utf7); + utf7++;len2--; + ccount+=6; + FLUSH_CACHE; + } + FLUSH_CACHE; + /* check and clear remaining bits */ + UNQUOTE; + return len; +} + +static char +tohex(int i) +{ + return "0123456789ABCDEF"[i]; +} + +static int +fromhex(char i) +{ + if(i>='0' && i<='9') + return i-'0'; + if(i>='a' && i<='f') + return i-'a'+10; + if(i>='A' && i<='F') + return i-'A'+10; + return -1; +} + +/* +static char PyWideString_FromQString[]= +"Creates a wide string from a uniquoted sttring." 
+*/ + +PyObject* +PyWideString_FromQString(PyObject* qs) +{ + char *string; + int len,newlen,i,k,d,lim; + PyWideChar v; + PyWideString* result; + + string = PyString_AsString(qs); + len = PyString_Size(qs); + + for(i=newlen=0; istring[newlen++]=v; + } + + return (PyObject*)result; +} + +PyObject* +PyWideString_ToQString(PyObject* self) +{ + int i, k, newlen; + PyWideString *old = (PyWideString*)self; + PyObject *result; + char *rstring; + + for(i=newlen=0;iob_size;i++) + if(old->string[i]<256) + newlen++; + else + newlen+=6; + + result = PyString_FromStringAndSize(0,newlen); + if(!result) + return NULL; + rstring = PyString_AsString(result); + + for(i=0;iob_size;i++) + if(old->string[i]<256) + *(rstring++) = old->string[i]; + else{ + *(rstring++) = '\\'; + *(rstring++) = 'u'; + k = old->string[i]; + /* XXX characters in other planes */ + *(rstring++) = tohex((k>>12) & 0xF); + *(rstring++) = tohex((k>>8) & 0xF); + *(rstring++) = tohex((k>>4) & 0xF); + *(rstring++) = tohex(k & 0xF); + } + return result; +} + +PyObject* +PyWideString_FromUtf8(PyObject* self) +{ + char *string; + char *tmp; + PyWideString *wstr = 0; + PyWideChar wtmp; + int len,i,l1,newlen; + + string = PyString_AsString(self); + len = PyString_Size(self); + for(i=0,newlen=0;istring+i); + return (PyObject*)wstr; +} + +PyObject* +PyWideString_Utf8TryLatin1(PyObject* self) +{ + unsigned char *string,*rstr; + int len,l1,i,newlen; + PyWideChar wtmp; + PyObject *result; + + string = PyString_AsString(self); + len = PyString_Size(self); + for(i=newlen=0;i=256) + return PyWideString_FromUtf8(self); + } + if(newlen==len){ + /* ASCII */ + Py_INCREF(self); + return self; + } + result=PyString_FromStringAndSize(NULL,newlen); + if(!result) + return NULL; + rstr = PyString_AsString(result); + for(i=newlen=0;istring[i] = (PyWideChar)string[i]; + } + return result; + } + + /* Dictionary mappings always use 8bit characters */ + val=PyDict_GetItemString(Decodings,type); + if(val && PyDict_Check(val)){ + PyWideString 
*result = PyWideString_New(len); + int i; + PyObject *I,*C; + if(!result)return 0; + for(i=0;istring[i]=string[i]; + continue; + } + PyErr_SetString(ConvertError,"Invalid character in source"); + PyWideString_Free(result); + return NULL; + } + result->string[i] = (PyWideChar)PyInt_AsLong(C); + } + return result; + } + + /* Mapping functions do everything */ + val=PyDict_GetItemString(DecodingFunctions,type); + if(val && PyCallable_Check(val)){ + PyWideString *result; + PyObject *args=Py_BuildValue("s#i",string,len,flags); + if(!args)return 0; + result=(PyWideString*)PyEval_CallObject(val,args); + Py_DECREF(args); + if(!PyWideString_Check(result)){ + PyErr_SetString(PyExc_TypeError,"conversion function did not return a wide string"); + return 0; + } + return result; + } + PyErr_SetString(ConvertError,"Unknown character set"); + return NULL; +} + +static char PyWideString_Encode_Doc[]= +"Encodes a wide string when given an encoding name.\n" +"The encoding must be an alias, a mapping table, a function or built-in.\n" +"Optional flags have encoding-dependent semantics.\n"; + +static PyObject* +PyWideString_Encode(PyWideString* self,PyObject *args) +{ + char *type; + int flags=0; + char *s; + PyObject *val; + if(!PyArg_ParseTuple(args,"s|i",&type,&flags))return 0; + /* Alias processing, allow one level of indirection */ + val=PyDict_GetItemString(Aliases,type); + if(val && PyString_Check(val)) + type=PyString_AsString(val); + /* 8859-1 is builtin */ + if(!strcmp(type,"ISO_8859-1:1987")){ + PyObject *result = PyString_FromStringAndSize(0,self->ob_size); + int i; + int skipped=0; + if(!result)return 0; + s=PyString_AsString(result); + for(i=0;iob_size;i++){ + if(self->string[i]>=256){ + if(flags && SKIP_INVALID){ + skipped++; + continue; + } + Py_DECREF(result); + PyErr_SetString(ConvertError,"Unconvertible wide character"); + return 0; + } + s[i-skipped] = (char)self->string[i]; + } + if(skipped){ + PyObject *s1=PySequence_GetSlice(result,0,self->ob_size-skipped); + 
Py_DECREF(result); + result=s1; + } + return result; + } + + /* Dictionary mappings always use 8bit characters */ + val=PyDict_GetItemString(Encodings,type); + if(val && PyDict_Check(val)){ + PyObject *result = PyString_FromStringAndSize(0,self->ob_size); + int i; + int skipped=0; + PyObject *I,*C; + if(!result)return 0; + s=PyString_AsString(result); + for(i=0;iob_size;i++){ + I=PyInt_FromLong(self->string[i]); + if(!I)return 0; + C=PyDict_GetItem(val,I);Py_DECREF(I); + if(!C){ + if(flags && SKIP_INVALID){ + skipped++; + continue; + } + PyErr_SetString(ConvertError,"Unconvertible wide character"); + Py_DECREF(result); + return NULL; + } + s[i-skipped] = (char)PyInt_AsLong(C); + } + if(skipped){ + PyObject *s1=PySequence_GetSlice(result,0,self->ob_size-skipped); + Py_DECREF(result); + result=s1; + } + return result; + } + + /* Mapping functions do everything */ + val=PyDict_GetItemString(EncodingFunctions,type); + if(val && PyCallable_Check(val)){ + PyObject *result; + PyObject *args=Py_BuildValue("Oi",self,flags); + if(!args)return 0; + result=PyEval_CallObject(val,args); + Py_DECREF(args); + if(!PyString_Check(result)){ + PyErr_SetString(PyExc_TypeError,"conversion function did not return a string"); + return 0; + } + return result; + } + + PyErr_SetString(ConvertError,"Unknown character set"); + return NULL; +} + +PyObject* +PyWideString_ToUtf8(PyObject* s) +{ + PyWideString *self=(PyWideString*)s; + PyObject *string; + int len,l1,i; + char *str; + + for(len=i=0;iob_size;i++){ + l1=to_utf8(self->string[i],0); + if(!l1){ + len=-1; + break; + } + len+=l1; + } + if(len==-1){ + PyErr_SetString(ConvertError,"Illegal 10646 character"); + return 0; + } + string = PyString_FromStringAndSize(0,len); + if(!string)return 0; + str=PyString_AsString(string); + for(i=0;iob_size;i++) + str+=to_utf8(self->string[i],str); + return string; +} + +PyObject* +PyString_ToUtf8(PyObject* self) +{ + PyObject *result; + unsigned char *str,*rstr; + int i,extra,len; + + str = 
PyString_AsString(self); + len = PyString_Size(self); + for(i=extra=0;i127) + extra++; + if(!extra) { + Py_INCREF(self); + return self; + } + result = PyString_FromStringAndSize(NULL,len+extra); + if(!result) + return NULL; + rstr = PyString_AsString(result); + for(i=extra=0;istring[i/2-skipped]=c; + continue; + } + /* check for UTF-16 support */ + if(flags & UCS2_DO_UTF16){ + PyWideChar next; + /* not in high-half zone: coding error*/ + if((c & 0xFC00) != 0xD800) + if(flags & SKIP_INVALID){ + skipped++; + continue; + }else{ + error="unpaired low-half UTF-16 cell"; + break; + } + i+=2; + if(i>=len){ + if(flags & SKIP_INVALID) + skipped++; + else + error="unpaired high-half UTF-16 cell"; + break; + } + next=(string[i]<<8)|(string[i+1]); + /* not in low-half zone: coding error */ + if((next & 0xFC00) != 0xDC00) + if(flags & SKIP_INVALID){ + skipped++;i-=2; /*process next again*/ + continue; + }else{ + error="unpaired high-half UTF-16 cell"; + break; + } + /* construct cell */ + skipped++; + result->string[i/2-skipped] = ((c & 0x3FF)<<10) | (next & 0x3FF); + continue; + } + /* no UTF-16 support requested */ + if(flags & SKIP_INVALID){ + skipped++; + continue; + } + error="UTF-16 cells in UCS-2 string"; + break; + } + if(error){ + Py_DECREF(result); + PyErr_SetString(ConvertError,error); + return 0; + } + if(skipped){ + PyObject *s=PyWideString_Slice(result,0,result->ob_size-skipped); + Py_DECREF(result); + result=(PyWideString*)s; + } + return result; +} + +static char PyWideString_ToUcs2_Doc[]= +"Returns the UCS-2 encoding for the wide string, or UTF-16 if.\n" +"UCS2_DO_UTF16 is given. 
Raises ConvertError if characters are not\n" +"in the supported range."; + +static PyObject* +PyWideString_ToUcs2(PyWideString* self,PyObject *args) +{ + PyObject *string; + char *s; + int i; + int flags=0; + int newlen,newindex; + PyWideChar tmp; + char* error=0; + + if(!PyArg_ParseTuple(args,"|i",&flags))return 0; + if(flags & UCS2_DO_UTF16){ + for(i=newlen=0;iob_size;i++){ + tmp=self->string[i]; + /* cells in the range supported by UTF-16 will need 4 bytes */ + newlen += (tmp>=0x10000 && tmp<0x110000) ? 2:1; + } + }else + newlen=self->ob_size; + string = PyString_FromStringAndSize(NULL,2*newlen); + if(!string) return 0; + s=PyString_AsString(string); + for(i=newindex=0;iob_size;i++){ + tmp=self->string[i]; + if(tmp<0x10000){ + s[newindex++] = (self->string[i]>>8) & 0xFF; + s[newindex++] = self->string[i] & 0xFF; + continue; + } + if(tmp<0x110000){ + if(flags & UCS2_DO_UTF16){ + int high=0xD800|(tmp>>10); + int low=0xDC00|(tmp & 0x3FF); + s[newindex++]=high >> 8; + s[newindex++]=high & 0xFF; + s[newindex++]=low >> 8; + s[newindex++]=low & 0xFF; + continue; + } + /* no UTF-16 support */ + if(flags & SKIP_INVALID) + continue; + error="character out of range for UCS-2, try UTF-16"; + break; + } + if(flags & SKIP_INVALID) + continue; + error=(flags & UCS2_DO_UTF16)? + "character out of range for UTF-16": + "character out of range for UCS-2"; + break; + } + if(error){ + Py_DECREF(string); + PyErr_SetString(ConvertError,error); + return 0; + } + if(2*newlen != newindex){ /* some cells were skipped */ + PyObject *s=PySequence_GetSlice(string,0,newindex); + Py_DECREF(string); + string=s; + } + return string; +} + +char PyWideString_FromUtf7_Doc[]= +"Creates a wide string from an UTF-7 encoding. 
Supports UTF-16\n" +"in the intermediate UCS-2 encoding if the flag says UCS2_DO_UTF16.\n" +"SKIP_INVALID in the flags avoids ConvertError being raised."; + +static PyWideString* +PyWideString_FromUtf7(PyObject*self,PyObject* args) +{ /* Decodes UTF-7 with two passes of utf7_to_ucs2, then hands the UCS-2 bytes to PyWideString_FromUcs2; returns 0 on failure. */ + unsigned char* string; + int len,len2; + int flags=0; + PyObject *ucs2; + PyWideString *result; + + if(!PyArg_ParseTuple(args,"s#|i",&string,&len,&flags))return 0; + len2=utf7_to_ucs2(0,string,len,flags); /* null destination: the result only sizes the buffer below */ + if(len2<0)return 0; + ucs2=PyString_FromStringAndSize(0,2*len2); + if(!ucs2)return 0; + utf7_to_ucs2(PyString_AsString(ucs2),string,len,flags); + args=Py_BuildValue("Oi",ucs2,flags);Py_DECREF(ucs2); /* "O" takes its own reference, so drop ours whether or not the tuple was built */ + if(!args)return 0; + result=PyWideString_FromUcs2(0,args); + Py_DECREF(args); + return result; +} + +char PyWideString_ToUtf7_Doc[]= +"Returns the UTF-7 encoding for a wide string. Supports UTF-16\n" +"in the intermediate UCS-2 encoding if the flag says UCS2_DO_UTF16.\n" +"SKIP_INVALID in the flags avoids ConvertError being raised."; + +static PyObject* +PyWideString_ToUtf7(PyWideString* self,PyObject *args) +{ /* Encodes self as UTF-7 via an intermediate UCS-2 string; UTF7_QUOTE_OPTIONALS selects SET_D instead of SET_C for ucs2_to_utf7. */ + PyObject *ucs2,*utf7; + int flags=0; + int len; + if(!PyArg_ParseTuple(args,"|i",&flags))return 0; + ucs2=PyWideString_ToUcs2(self,args); + if(!ucs2)return 0; + len=ucs2_to_utf7(0,PyString_AsString(ucs2),PyObject_Length(ucs2), + (flags & UTF7_QUOTE_OPTIONALS) ? SET_D : SET_C); + if(len<0){ + Py_DECREF(ucs2); + PyErr_SetString(ConvertError,"Invalid intermediate ucs2"); + return 0; + } + utf7=PyString_FromStringAndSize(NULL,len); + if(!utf7){Py_DECREF(ucs2);return 0;} /* do not leak the intermediate when the allocation fails */ + ucs2_to_utf7(PyString_AsString(utf7),PyString_AsString(ucs2), + PyObject_Length(ucs2), + (flags & UTF7_QUOTE_OPTIONALS) ? 
SET_D : SET_C); + Py_DECREF(ucs2); + return utf7; +} + +static char PyWideString_FromUcs4_Doc[]= +"Creates a wide string from an UCS-4 string.\n" +"Later versions will support byte swapping, if the string is not bigendian."; + +static PyWideString* +PyWideString_FromUcs4(PyWideString *self,PyObject *args) +{ + unsigned char *string; + int len,i; + int flags=0; + PyWideString *result; + if(!PyArg_ParseTuple(args,"s#|i",&string,&len,&flags))return 0; + if((len & 3) && !(flags & SKIP_INVALID)){ + PyErr_SetString(ConvertError,"Length of UCS-4 string not multiple of 4"); + return 0; + } + result=PyWideString_New(len/4); + if(!result)return 0; + for(i=0;i0xD8000 && tmp<0xE000){ + /* maybe we should consider UTF-16 processing in this case */ + PyErr_SetString(ConvertError,"Reserved UTF-16 cells in UCS-4 string"); + return 0; + } + result->string[i]=tmp; + } + return result; +} + +static char PyWideString_ToUcs4_Doc[]= +"Converts a wide string into UCS-4.\n" +"Later versions will support byte swapping on request."; + +static PyObject* +PyWideString_ToUcs4(PyWideString *self,PyObject* args) +{ + int flags=0; + PyObject* result; + unsigned char *s; + int i; + /* this flags are ignored, anyways */ + if(!PyArg_ParseTuple(args,"|i",&flags))return 0; + result=PyString_FromStringAndSize(0,4*self->ob_size); + if(!result)return 0; + s=PyString_AsString(result); + for(i=0;iob_size;i++){ + PyWideChar tmp=self->string[i]; + s[4*i]=tmp>>24; + s[4*i+1]=(tmp>>16) & 0xFF; + s[4*i+2]=(tmp>>8) & 0xFF; + s[4*i+3]=tmp & 0xFF; + } + return result; +} + +static char PyWideString_FromUtf16_Doc[]= +"Creates a wide string from UTF-16.\n" +"This is a wrapper to from_ucs2."; + +PyWideString * +PyWideString_FromUtf16(PyObject* self,PyObject *args) +{ + PyObject *string; + PyWideString *result; + int flags=0; + if(!PyArg_ParseTuple(args,"S|i",&string,&flags))return 0; + flags|=UCS2_DO_UTF16; + args=Py_BuildValue("Oi",string,flags); + if(!args)return 0; + result=PyWideString_FromUcs2(self,args); + 
Py_DECREF(args); + return result; +} + +static char PyWideString_ToUtf16_Doc[]= +"Returns the UTF-16 coding for the wide string.\n" +"This is a wrapper for to_ucs2."; + +PyObject * +PyWideString_ToUtf16(PyWideString* self,PyObject *args) +{ + PyObject *result; + int flags=0; + if(!PyArg_ParseTuple(args,"|i",&flags))return 0; + flags|=UCS2_DO_UTF16; + args=Py_BuildValue("(i)",flags); + if(!args)return 0; + result=PyWideString_ToUcs2(self,args); + Py_DECREF(args); + return result; +} + +/* return len(self) */ +static int +PyWideString_Length(PyWideString* self) +{ + return self->ob_size; +} + +/* XXX needs a rename! */ +/* return a+b */ +static PyObject* +widestring_concat(PyObject *av,PyObject *bv) +{ /* Either operand may be an 8-bit or a wide string; 8-bit bytes are widened one cell per byte. An empty operand returns the other operand itself. */ + PyWideString *result; + PyVarObject *a = (PyVarObject *) av, *b = (PyVarObject *)bv; + + if(!PyString_Check(a) || !PyString_Check(b)){ + PyErr_BadArgument(); + return 0; + } + /* optimize len(a)==0 and len(b)==0 */ + if(a->ob_size==0){ + Py_INCREF(b); + return (PyObject*)b; + } + if(b->ob_size==0){ + Py_INCREF(a); + return (PyObject*)a; + } + result=PyWideString_New(a->ob_size+b->ob_size); + if(!result)return NULL; + + if (PyWideString_Check(a)) + { + PyWideString *as_wstr = (PyWideString *)a; + memcpy(result->string, as_wstr->string, + as_wstr->ob_size*sizeof(PyWideChar)); + } + else + { + PyStringObject *as_str = (PyStringObject *)a; + int i; + for(i = 0; i < as_str->ob_size; i++) + { + result->string[i] = (PyWideChar)(unsigned char) as_str->ob_sval[i]; /* go through unsigned char so bytes >127 are not sign-extended where char is signed */ + } + } + if (PyWideString_Check(b)) + { + PyWideString *as_wstr = (PyWideString *)b; + /* Add another wide string */ + memcpy(result->string+a->ob_size, as_wstr->string, + as_wstr->ob_size*sizeof(PyWideChar));} + else + { /* Add an ordinary string */ + PyStringObject *as_str = (PyStringObject *)b; + int i, pos = a->ob_size; + for(i = 0; i < as_str->ob_size; i++, pos++) + { + result->string[pos] = (PyWideChar)(unsigned char) as_str->ob_sval[i]; /* same widening rule as for the left operand */ + } + } + /* XXX the following line seems to be redundant, so AMK commented it out */ + /* 
result->ob_size=a->ob_size+b->ob_size; */ + /* New already set last field to 0 */ + return (PyObject*)result; +} + +/* return a*n */ +static PyObject* +PyWideString_Repeat(PyWideString* a,int n) +{ + int len,size,i; + PyWideString *result; + if(n<0)n=0; + len=a->ob_size*n; + if(len==a->ob_size){ + Py_INCREF(a); + return (PyObject*)a; + } + size=a->ob_size*sizeof(PyWideChar); + result=PyWideString_New(len); + if(!result)return 0; + for(i=0;istring))+i*size,a->string,size); + return (PyObject*)result; +} + +/* return a[i] */ +static PyObject* +PyWideString_Item(PyWideString* a,int i) +{ + PyWideString *result; + if(i<0 || i>=a->ob_size){ + PyErr_SetString(PyExc_IndexError,"wstring index out of range"); + return 0; + } + result=PyWideString_New(1); + if(!result)return 0; + result->string[0]=a->string[i]; + return (PyObject*)result; +} + + +/* return a[i:j] */ +static PyObject* +PyWideString_Slice(PyWideString* a,int i,int j) +{ + PyWideString *result; + if(i<0)i=0; + if(j<0)j=0; + if(j>a->ob_size)j=a->ob_size; + if(i==0 && j==a->ob_size){ + Py_INCREF(a); + return (PyObject*)a; + } + if(jstring,a->string+i,(j-i)*sizeof(PyWideChar)); + return (PyObject*)result; +} + +/* AMK: changed from wstring.L to unicode; still uses UTF-8 encoding, + which is now inconsistent */ + +/* return "unicode("+repr(a.isostring())+")" */ +static PyObject* +PyWideString_Repr(PyWideString* a) +{ + PyObject *result, *tmp; + + /* \u quoting of the wide string */ + tmp=PyWideString_ToQString((PyObject*)a); + if(!tmp)return 0; + /* character representation of it. 
This is necessary to quote quotes */ + result=PyObject_Repr(tmp); + Py_DECREF(tmp); + if(!result)return 0; + tmp=result; + /* surrounded by unicode() */ + result=PyString_FromStringAndSize(0,PyObject_Length(tmp)+9); + if(!result){ + Py_DECREF(tmp); + return 0; + } + sprintf(PyString_AsString(result),"unicode(%s)",PyString_AsString(tmp)); + return result; +} + +/* return aob_size,b_len=b->ob_size; + int min_len=(a_lenstring[i]string[i]) + return -1; + if(a->string[i]>as_wstr->string[i]) + return 1; + } + } + else if (PyString_Check(bv)) + { + PyStringObject *as_str = (PyStringObject *)b; + + for(i=0;istring[i] < (unsigned char)as_str->ob_sval[i]) + return -1; + if(a->string[i] > (unsigned char)as_str->ob_sval[i]) + return 1; + } + } + return a_lenob_shash != -1) + return a->ob_shash; +#ifdef INTERN_STRINGS + if (a->ob_sinterned != NULL) + return (a->ob_shash = + ((PyWideString *)(a->ob_sinterned))->ob_shash); +#endif +#endif + + len = a->ob_size; + p = a->string; + x = *p << 7; + while (--len >= 0) + x = (1000003*x) ^ *p++; + x ^= a->ob_size; + if (x == -1) + x = -2; +#ifdef CACHE_HASH + a->ob_shash = x; +#endif + return x; +} + +static char PyWideString_chr_Doc[]= +"Returns the one-character wide string at the given cell.\n"; + +static PyWideString* +PyWideString_chr(PyWideString* self,PyObject *args) +{ + int val; + PyWideString *wval; + if(!PyArg_ParseTuple(args,"l",&val))return 0; + if(val<0){ + PyErr_SetString(PyExc_ValueError,"negative wide character"); + return 0; + } + if(val>0xD800 && val<0xE000){ + PyErr_SetString(ConvertError,"zone reserved for UTF-16"); + return 0; + } + wval=PyWideString_New(1); + if(!wval)return 0; + wval->string[0]=val; + return wval; +} + +static char PyWideString_ord_Doc[]= +"Return the cell value (code point) for the given one-character string.\n"; +static PyObject* +PyWideString_ord(PyObject *self,PyObject *args) +{ + PyWideString *val; + if(!PyArg_ParseTuple(args,"O!",&PyWideString_Type,&val))return 0; + return 
PyInt_FromLong(val->string[0]); +} + +static struct PyMethodDef PyWideString_Methods[] = { + {"ucs2", (PyCFunction)PyWideString_ToUcs2, 1, PyWideString_ToUcs2_Doc}, + {"ucs4", (PyCFunction)PyWideString_ToUcs4, 1, PyWideString_ToUcs4_Doc}, + {"utf7", (PyCFunction)PyWideString_ToUtf7, 1, PyWideString_ToUtf7_Doc}, + {"utf16", (PyCFunction)PyWideString_ToUtf16, 1, PyWideString_ToUtf16_Doc}, + {"encode", (PyCFunction)PyWideString_Encode, 1, PyWideString_Encode_Doc}, + {NULL,NULL} +}; + +static PyObject * +PyWideString_GetAttr(PyWideString* o,char *name) +{ + return Py_FindMethod(PyWideString_Methods, (PyObject*)o, name); +} + +static int +wstring_buffer_getreadbuf(self, index, ptr) + PyWideString *self; + int index; + const void **ptr; +{ + if ( index != 0 ) { + PyErr_SetString(PyExc_SystemError, + "Accessing non-existent string segment"); + return -1; + } + *ptr = (void *)self->string; + return self->ob_size * sizeof(PyWideChar); +} + +static int +wstring_buffer_getwritebuf(self, index, ptr) + PyWideString *self; + int index; + const void **ptr; +{ + PyErr_SetString(PyExc_TypeError, + "Cannot use wide string as modifiable buffer"); + return -1; +} + +static int +wstring_buffer_getsegcount(self, lenp) + PyWideString *self; + int *lenp; +{ + if ( lenp ) + *lenp = self->ob_size; + return 1; +} + +statichere PySequenceMethods PyWideString_AsSequence ={ + (inquiry)PyWideString_Length, /*sq_length*/ + (binaryfunc)widestring_concat, /*sq_concat*/ + (intargfunc)PyWideString_Repeat, /* sq_repeat*/ + (intargfunc)PyWideString_Item, /*sq_item*/ + (intintargfunc)PyWideString_Slice, /*sq_slice*/ + (intobjargproc)0, /*sq_ass_item*/ + (intintobjargproc)0 /*sq_ass_slice*/ +}; + +static PyBufferProcs wstring_as_buffer = { + (getreadbufferproc)wstring_buffer_getreadbuf, + (getwritebufferproc)wstring_buffer_getwritebuf, + (getsegcountproc)wstring_buffer_getsegcount, +}; + +PyTypeObject PyWideString_Type = { + PyObject_HEAD_INIT(&PyType_Type) + 0, /*ob_size*/ + "wstring", /*tp_name*/ + 
sizeof(PyWideString), /*tp_size*/ + sizeof(PyWideChar), /*tp_itemsize*/ + (destructor)PyWideString_Free, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr : disabled by amk */ + /* (getattrfunc)PyWideString_GetAttr, /*tp_getattr*/ + 0, /*tp_setattr*/ + (cmpfunc)PyWideString_Compare, /*tp_compare*/ + (reprfunc)PyWideString_Repr, /*tp_repr*/ + 0, /*tp_as_number*/ + &PyWideString_AsSequence, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + (hashfunc)PyWideString_Hash, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + &wstring_as_buffer, /*tp_as_buffer*/ + 0, /*tp_xxx4*/ + 0, /*tp_doc*/ +}; + +#if 0 /* Not an extension module any more -- amk */ +static struct PyMethodDef PyWStrop_Methods[] = { + {"decode",(PyCFunction)PyWideString_Decode, 1, PyWideString_Decode_Doc}, + {"chr",(PyCFunction)PyWideString_chr, 1, PyWideString_chr_Doc}, + {"ord",(PyCFunction)PyWideString_ord, 1, PyWideString_ord_Doc}, + {"from_ucs2",(PyCFunction)PyWideString_FromUcs2, 1, PyWideString_FromUcs2_Doc}, + {"from_ucs4",(PyCFunction)PyWideString_FromUcs4, 1, PyWideString_FromUcs4_Doc}, + {"from_utf7",(PyCFunction)PyWideString_FromUtf7, 1, PyWideString_FromUtf7_Doc}, + {"from_utf16",(PyCFunction)PyWideString_FromUtf16, 1, PyWideString_FromUtf16_Doc}, + {NULL, NULL} +}; + +void +initwstrop() +{ + PyObject *m,*d,*s; + m=Py_InitModule4("wstrop",PyWStrop_Methods,PyWStrop_Doc, + 0,PYTHON_API_VERSION); + d = PyModule_GetDict(m); + /* Where should I put documentation for the constants? + For now, it goes into the module documentation... 
*/ + ConvertError = PyString_FromString("wstrop.ConvertError"); + PyDict_SetItemString(d,"ConvertError",ConvertError); + PyDict_SetItemString(d,"SKIP_INVALID",PyInt_FromLong(SKIP_INVALID)); + PyDict_SetItemString(d,"UTF7_QUOTE_OPTIONALS", + PyInt_FromLong(UTF7_QUOTE_OPTIONALS)); + PyDict_SetItemString(d,"UCS_SWITCH_BYTEORDER", + PyInt_FromLong(UCS_SWITCH_BYTEORDER)); + PyDict_SetItemString(d,"UCS2_DO_UTF16", + PyInt_FromLong(UCS2_DO_UTF16)); + PyDict_SetItemString(d,"encodings",Encodings=PyDict_New()); + PyDict_SetItemString(d,"decodings",Decodings=PyDict_New()); + PyDict_SetItemString(d,"encoding_functions",EncodingFunctions=PyDict_New()); + PyDict_SetItemString(d,"decoding_functions",DecodingFunctions=PyDict_New()); + PyDict_SetItemString(d,"aliases",Aliases=PyDict_New()); + /*8859-1 aliases are builtin*/ + s=PyString_FromString("ISO_8859-1:1987"); + PyDict_SetItemString(Aliases,"ISO-IR-100",s); + Py_INCREF(s); /* the SetItemString ate the reference */ + PyDict_SetItemString(Aliases,"ISO_8859-1",s); + Py_INCREF(s); + PyDict_SetItemString(Aliases,"LATIN1",s); + Py_INCREF(s); + PyDict_SetItemString(Aliases,"L1",s); + Py_INCREF(s); + PyDict_SetItemString(Aliases,"IBM819",s); + Py_INCREF(s); + PyDict_SetItemString(Aliases,"CP819",s); + + if(PyErr_Occurred()) + Py_FatalError("Can't initialize module wstrop"); +} +#endif diff -urN -X /usr/src/excl.diff Python-1.5.orig/Python/bltinmodule.c Python-1.5/Python/bltinmodule.c --- Python-1.5.orig/Python/bltinmodule.c Wed Dec 10 06:51:47 1997 +++ Python-1.5/Python/bltinmodule.c Fri Apr 17 08:58:55 1998 @@ -516,6 +516,9 @@ "eval() argument 1 must be string or code object"); return NULL; } + if (PyWideString_Check(cmd) && + PyWideString_To8bitString((PyWideString *)cmd)==NULL) + {return NULL;} str = PyString_AsString(cmd); if ((int)strlen(str) != PyString_Size(cmd)) { PyErr_SetString(PyExc_ValueError, @@ -1586,6 +1589,38 @@ } static PyObject * +builtin_unicode(self, args) + PyObject *self; + PyObject *args; +{ + PyObject *v; + + 
if (!PyArg_ParseTuple(args, "O:unicode", &v)) + return NULL; + if (PyString_Check(v)) + return PyWideString_FromQString(v); + if (PyInt_Check(v)) + { + PyWideString *new_wstr; + PyWideChar w; + + long value = PyInt_AsLong(v); + if (value < 0 || value>65535) + { + PyErr_SetString(PyExc_ValueError, "unicode() takes numeric arguments between 0 and 65535"); + return NULL; + } + + w = (PyWideChar) value; + new_wstr=(PyWideString *)PyWideString_FromWideCharArray(&w, 1); + return (PyObject *)new_wstr; + } + PyErr_SetString(PyExc_TypeError, + "unicode() requires integer or string argument"); + return NULL; +} + +static PyObject * builtin_vars(self, args) PyObject *self; PyObject *args; @@ -1724,6 +1759,7 @@ {"str", builtin_str, 1}, {"tuple", builtin_tuple, 1}, {"type", builtin_type, 1}, + {"unicode", builtin_unicode, 1}, {"vars", builtin_vars, 1}, {"xrange", builtin_xrange, 1}, {NULL, NULL}, diff -urN -X /usr/src/excl.diff Python-1.5.orig/Python/ceval.c Python-1.5/Python/ceval.c --- Python-1.5.orig/Python/ceval.c Wed Dec 31 06:53:51 1997 +++ Python-1.5/Python/ceval.c Wed Apr 8 07:41:38 1998 @@ -2551,14 +2551,21 @@ PyObject *x; PySequenceMethods *sq; /* Special case for char in string */ - if (PyString_Check(w)) { + if (PyString_Check(w) && !PyWideString_Check(w)) { register char *s, *end; register char c; - if (!PyString_Check(v) || PyString_Size(v) != 1) { + if (!PyString_Check(v) || PyString_Size(v) != 1 ) { PyErr_SetString(PyExc_TypeError, "string member test needs char left operand"); return -1; } + if (PyWideString_Check(v) && + PyWideString_To8bitString((PyWideString *)v)==NULL) + { + PyErr_SetString(PyExc_TypeError, + "string member test can't use a wide character >255"); + return -1; + } c = PyString_AsString(v)[0]; s = PyString_AsString(w); end = s + PyString_Size(w); @@ -2804,6 +2811,9 @@ return -1; return 0; } + if (PyWideString_Check(prog) && + PyWideString_To8bitString((PyWideString *)prog)==NULL) + {return -1;} s = PyString_AsString(prog); if ((int)strlen(s) 
!= PyString_Size(prog)) { PyErr_SetString(PyExc_ValueError, diff -urN -X /usr/src/excl.diff Python-1.5.orig/Python/getargs.c Python-1.5/Python/getargs.c --- Python-1.5.orig/Python/getargs.c Fri Dec 19 05:25:23 1997 +++ Python-1.5/Python/getargs.c Wed Apr 8 07:41:38 1998 @@ -539,7 +539,14 @@ { char *p = va_arg(*p_va, char *); if (PyString_Check(arg) && PyString_Size(arg) == 1) + { + if (PyWideString_Check(arg) && + PyWideString_To8bitString((PyWideString *)arg)==NULL) + { + return NULL; + } *p = PyString_AsString(arg)[0]; + } else return "char"; break; @@ -567,7 +574,18 @@ } else { char **p = va_arg(*p_va, char **); - if (PyString_Check(arg)) + if (PyWideString_Check(arg)) + { + if (NULL == + PyWideString_To8bitString((PyWideString *)arg)) + return "wide string"; + + /* The wide string has now been + transformed into an 8-bit + string */ + *p = PyString_AsString(arg); + } + else if (PyString_Check(arg)) *p = PyString_AsString(arg); else return "string"; @@ -607,7 +625,14 @@ if (arg == Py_None) *p = 0; else if (PyString_Check(arg)) - *p = PyString_AsString(arg); + { + /* XXX AMK should collapse here */ + if (PyWideString_Check(arg) && + NULL == PyWideString_To8bitString((PyWideString *)arg)) + return "wide string"; + + *p = PyString_AsString(arg); + } else return "None or string"; if (*format == '#') { diff -urN -X /usr/src/excl.diff Python-1.5.orig/UNICODE Python-1.5/UNICODE --- Python-1.5.orig/UNICODE Thu Jan 1 01:00:00 1970 +++ Python-1.5/UNICODE Wed Apr 8 07:41:38 1998 @@ -0,0 +1,170 @@ +This is a first cut at adding a wide string type to Python. It is +HIGHLY EXPERIMENTAL, so use at your own risk, and please make bug +reports and suggestions for improvements on the String-SIG +(string-sig@python.org). + +To compile this experimental version of Python, simply configure and +compile it in the usual way. unitest.py does some simple tests of the +new wide string type; please contribute new ones. + +These patches don't really follow Jim Huginin's proposal. 
In that +proposal, string objects would have an extra bit denoting whether +they're wide or regular. This patch takes a different tack, suggested +by GvR: implement wide strings as a separate type, change various +Python API functions to take either regular or wide strings, and tweak +some bits of Python to allow seamless mixing of both types. + +A stripped-down version of Martin von Loewis's wide string module +(wstrop) has been used as the basis for PyWideString; we can re-enable +capabilities and attributes from MvL's code in the future. Nothing's +been deleted, only #ifdef'ed out. Trouble spots in the code are +marked with XXX in comments, as usual in the Python source. (You +could also search for "amk".) + +Wide strings can be collapsed into 8-bit ones in place, with the +PyWideString_To8bitString() function; it raises an exception if the +string has any characters >255. This function does *not* use UTF8 or +any other encoding, but it probably should. PyArg_ParseTuple("s") +calls this function on wide strings, so you can pass a wide string +to built-ins such as open(), and to extension modules. + +What was done: + +New typedefs: PyWideChar, PyWideString + +Changes to Objects/wstringobject.c: + Use PyWideChar instead of wchar_t + Rename type from PyWString to PyWideString + + New type for wide strings, converted in place by changing the +ob_type field of the object (see PyWideString_To8bitString). This second +type is Martin von Loewis's wstrop module, with lots of stuff +#ifdef'ed out and wchar_t changed to PyWideChar. Various function +names have been changed, and new ones added. + +Modify Objects/object.c: + Ensure that compares work properly; 'a' == unicode('a') + +Modify Python/bltinmodule.c: + Added a unicode() function that takes a plain string and +returns an expanded version of it. (Perhaps it should be renamed wide()?) 
+ str(wide) leaves widestrings alone + repr(wide) is the original wstrop function, and returns +'unicode("...")', with the UTF-8 encoding of the string. This is +inconsistent with how the unicode() function works. + +Python/getargs.c: + ParseTuple("s") collapses the string, raises an exception if it can't +be converted cleanly. + s# : returns the actual data pointer, & the data area's length in +bytes (2 * len(), if sizeof(PyWideChar)==2) + +TODO: + +test_wide.py reports lots of errors. Why? + +How do wide strings interact with interning? + +Missing functions from wstringobject.c: + PyString_Format (copy implementation from stringobject.c?) + PyString_Concat, PyString_ConcatAndDel, _PyString_Resize (needed?) + +Modify Modules/stropmodule.c to work on both types of string + +Modify fileobject.c: + .write() method: collapse the string first (?) + +marshal.c, cPickle.c : Need to handle the wide strings + +Different encodings need to be available somehow; any suggestions for +an interface? + +How compatible is this with JPython? + +The patches aren't in the proper C indentation format, because I don't +have it available on my home computer. +This is a first cut at adding a wide string type to Python. It is +HIGHLY EXPERIMENTAL, so use at your own risk, and please make bug +reports and suggestions for improvements on the String-SIG +(string-sig@python.org). + +To compile this experimental version of Python, simply configure and +compile it in the usual way. unitest.py does some simple tests of the +new wide string type; please contribute new ones. + +These patches don't really follow Jim Hugunin's proposal. In that +proposal, string objects would have an extra bit denoting whether +they're wide or regular. This patch takes a different tack, suggested +by GvR: implement wide strings as a separate type, change various +Python API functions to take either regular or wide strings, and tweak +some bits of Python to allow seamless mixing of both types. 
+ +A stripped-down version of Martin von Loewis's wide string module +(wstrop) has been used as the basis for PyWideString; we can re-enable +capabilities and attributes from MvL's code in the future. Nothing's +been deleted, only #ifdef'ed out. Trouble spots in the code are +marked with XXX in comments, as usual in the Python source. (You +could also search for "amk".) + +Wide strings can be collapsed into 8-bit ones in place, with the +PyWideString_To8bitString() function; it raises an exception if the +string has any characters >255. This function does *not* use UTF-8 or +any other encoding, but it probably should. PyArg_ParseTuple("s") +calls this function on wide strings, so you can pass a wide string +to built-ins such as open(), and to extension modules. + +What was done: + +New typedefs: PyWideChar, PyWideString + +Changes to Objects/wstringobject.c: + Use PyWideChar instead of wchar_t + Rename type from PyWString to PyWideString + + New type for wide strings, converted in place by changing the +ob_type field of the object (see PyWideString_To8bitString). This second +type is Martin von Loewis's wstrop module, with lots of stuff +#ifdef'ed out and wchar_t changed to PyWideChar. Various function +names have been changed, and new ones added. + +Modify Objects/object.c: + Ensure that compares work properly; 'a' == unicode('a') + +Modify builtinmodule.c: + Added a unicode() function that takes a plain string and +returns an expanded version of it. (Perhaps it should be renamed wide()?) + str(wide) leaves widestrings alone + repr(wide) is the original wstrop function, and returns +'unicode("...")', with the UTF-8 encoding of the string. This is +inconsistent with how the unicode() function works. + +Python/getargs.c: + ParseTuple("s") collapses the string, raises an exception if it can't +be converted cleanly. + s# : returns the actual data pointer, & the data area's length in +bytes (2 * len(), if sizeof(PyWideChar)==2) + +TODO: + +test_wide.py reports lots of errors.
Why? + +How do wide strings interact with interning? + +Missing functions from wstringobject.c: + PyString_Format (copy implementation from stringobject.c?) + PyString_Concat, PyString_ConcatAndDel, _PyString_Resize (needed?) + +Modify Modules/stropmodule.c to work on both types of string + +Modify fileobject.c: + .write() method: collapse the string first (?) + +marshal.c, cPickle.c : Need to handle the wide strings + +Different encodings need to be available somehow; any suggestions for +an interface? + +How compatible is this with JPython? + +The patches aren't in the proper C indentation format, because I don't +have it available on my home computer. diff -urN -X /usr/src/excl.diff Python-1.5.orig/code.unused Python-1.5/code.unused --- Python-1.5.orig/code.unused Thu Jan 1 01:00:00 1970 +++ Python-1.5/code.unused Wed Apr 8 07:41:38 1998 @@ -0,0 +1,92 @@ + +I've got an inconsistency that needs resolving. Originally the +unicode() built-in (wonder if it should be named wide()?) would take +an ordinary string, and return a wide version of it. That is, +unicode('foo') returns a 3-character widestring 'f', 'o', 'o'. This +means that any string returned from unicode() has no values greater +than 255. But repr() returns "unicode('...')", which can't produce a +widestring with any characters >255, so if S contains a character like +2000, eval(repr(S)) != S. + + + Practically, this will probably be resolved when different +encodings are implemented; repr() would return unicode('...', utf8) or +whatever. 
+ +Bad repr() below: + + int i, pos; + + /* XXX repr() isn't very understandable when printed ; I took the + easy way out */ + /* surrounded by unicode() */ + result=(PyStringObject *)PyString_FromStringAndSize(0,a->ob_size * 8 + 9); + if(!result){ + return 0; + } + sprintf(result->ob_sval,"unicode('"); + pos=strlen(result->ob_sval); + + for(i = 0; i < a->ob_size; i++) + { + PyWideChar c = a->string[i] / 256; + result->ob_sval[ pos++ ] = '\\'; + result->ob_sval[ pos++ ] = '0' + (c / 64); + result->ob_sval[ pos++ ] = '0' + ( (c / 8) % 8); + result->ob_sval[ pos++ ] = '0' + (c % 8); + + c = a->string[i] % 256; + result->ob_sval[ pos++ ] = '\\'; + result->ob_sval[ pos++ ] = '0' + (c / 64); + result->ob_sval[ pos++ ] = '0' + ( (c / 8) % 8); + result->ob_sval[ pos++ ] = '0' + (c % 8); + } + result->ob_sval[ pos ] = '\0'; + return (PyObject *)result; + + +I've got an inconsistency that needs resolving. Originally the +unicode() built-in (wonder if it should be named wide()?) would take +an ordinary string, and return a wide version of it. That is, +unicode('foo') returns a 3-character widestring 'f', 'o', 'o'. This +means that any string returned from unicode() has no values greater +than 255. But repr() returns "unicode('...')", which can't produce a +widestring with any characters >255, so if S contains a character like +2000, eval(repr(S)) != S. + + + Practically, this will probably be resolved when different +encodings are implemented; repr() would return unicode('...', utf8) or +whatever. 
+ +Bad repr() below: + + int i, pos; + + /* XXX repr() isn't very understandable when printed ; I took the + easy way out */ + /* surrounded by unicode() */ + result=(PyStringObject *)PyString_FromStringAndSize(0,a->ob_size * 8 + 9); + if(!result){ + return 0; + } + sprintf(result->ob_sval,"unicode('"); + pos=strlen(result->ob_sval); + + for(i = 0; i < a->ob_size; i++) + { + PyWideChar c = a->string[i] / 256; + result->ob_sval[ pos++ ] = '\\'; + result->ob_sval[ pos++ ] = '0' + (c / 64); + result->ob_sval[ pos++ ] = '0' + ( (c / 8) % 8); + result->ob_sval[ pos++ ] = '0' + (c % 8); + + c = a->string[i] % 256; + result->ob_sval[ pos++ ] = '\\'; + result->ob_sval[ pos++ ] = '0' + (c / 64); + result->ob_sval[ pos++ ] = '0' + ( (c / 8) % 8); + result->ob_sval[ pos++ ] = '0' + (c % 8); + } + result->ob_sval[ pos ] = '\0'; + return (PyObject *)result; + diff -urN -X /usr/src/excl.diff Python-1.5.orig/unitest.py Python-1.5/unitest.py --- Python-1.5.orig/unitest.py Thu Jan 1 01:00:00 1970 +++ Python-1.5/unitest.py Wed Apr 8 07:41:38 1998 @@ -0,0 +1,94 @@ + +print "Testing widestring type" + +a=unicode("") +UnicodeType = type(a) + +assert type( unicode("a") + 'foo') == UnicodeType + +print ' Testing comparisions' + +for i in range(0, 256): + # Test comparisions + assert unicode(i) == chr(i) + assert unicode( chr(i) ) == chr(i) + + # Try comparisions in the other direction, too + assert chr(i) == unicode(i) + assert chr(i) == unicode( chr(i) ) + + # Compare the hashes + assert hash(chr(i)) == hash(unicode(i)) + +assert 'a' in 'abc' +assert unicode('a') in 'abc' +assert 'a' in unicode('abc') +assert unicode('a') in unicode('abc') + +print ' Using a wide string as a filename' + +filename = unicode('/tmp/py-unicode') +f = open(filename, 'w') +f.write('testing ... 
this should be in the file ' + filename+'\n') +f.write( unicode('Unicode string\n') ) # Will have zero bytes mixed in +f.close() + +print ' Testing some string module functions' +import string +haystack = 'A dark mirror... That was always the intention...' +needle = 'mirror' + +pos = string.find(haystack, needle) +assert pos == string.find( unicode(haystack), needle) +assert pos == string.find( haystack, unicode(needle)) +assert pos == string.find( unicode(haystack), unicode(needle)) + +import test_wide + + +print "Testing widestring type" + +a=unicode("") +UnicodeType = type(a) + +assert type( unicode("a") + 'foo') == UnicodeType + +print ' Testing comparisions' + +for i in range(0, 256): + # Test comparisions + assert unicode(i) == chr(i) + assert unicode( chr(i) ) == chr(i) + + # Try comparisions in the other direction, too + assert chr(i) == unicode(i) + assert chr(i) == unicode( chr(i) ) + + # Compare the hashes + assert hash(chr(i)) == hash(unicode(i)) + +assert 'a' in 'abc' +assert unicode('a') in 'abc' +assert 'a' in unicode('abc') +assert unicode('a') in unicode('abc') + +print ' Using a wide string as a filename' + +filename = unicode('/tmp/py-unicode') +f = open(filename, 'w') +f.write('testing ... this should be in the file ' + filename+'\n') +f.write( unicode('Unicode string\n') ) # Will have zero bytes mixed in +f.close() + +print ' Testing some string module functions' +import string +haystack = 'A dark mirror... That was always the intention...' 
+needle = 'mirror' + +pos = string.find(haystack, needle) +assert pos == string.find( unicode(haystack), needle) +assert pos == string.find( haystack, unicode(needle)) +assert pos == string.find( unicode(haystack), unicode(needle)) + +import test_wide + diff -urN -X /usr/src/excl.diff Python-1.5.orig/utkdemo.py Python-1.5/utkdemo.py --- Python-1.5.orig/utkdemo.py Thu Jan 1 01:00:00 1970 +++ Python-1.5/utkdemo.py Sun Apr 19 08:40:48 1998 @@ -0,0 +1,13 @@ +import Tkinter + +Tkinter.Label(text=unicode("This is the \u0391 and \u03A9")).pack() + +def quit(): + raise SystemExit + +msg=unicode("\u0414\u043E \u0441\u0432\u0438\u0434\u0430\u043d\u0438\u044f") +b=Tkinter.Button(text=msg,command=quit) +b.pack() + +b.mainloop() +