unix/fiss

lib/libutf/rune.c in master
Repositories | Summary | Log | Files | LICENSE

rune.c (4071B) download


  1/*
  2 * The authors of this software are Rob Pike and Ken Thompson.
  3 *              Copyright (c) 2002 by Lucent Technologies.
  4 * Permission to use, copy, modify, and distribute this software for any
  5 * purpose without fee is hereby granted, provided that this entire notice
  6 * is included in all copies of any software which is or includes a copy
  7 * or modification of this software and in all copies of the supporting
  8 * documentation for such software.
  9 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
 10 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
 11 * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 12 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 13 */
 14#include "plan9.h"
 15#include "utf.h"
 16
 17#include <stdarg.h>
 18#include <string.h>
 19
 20enum {
 21	Bit1 = 7,
 22	Bitx = 6,
 23	Bit2 = 5,
 24	Bit3 = 4,
 25	Bit4 = 3,
 26	Bit5 = 2,
 27
 28	T1 = ((1 << (Bit1 + 1)) - 1) ^ 0xFF, /* 0000 0000 */
 29	Tx = ((1 << (Bitx + 1)) - 1) ^ 0xFF, /* 1000 0000 */
 30	T2 = ((1 << (Bit2 + 1)) - 1) ^ 0xFF, /* 1100 0000 */
 31	T3 = ((1 << (Bit3 + 1)) - 1) ^ 0xFF, /* 1110 0000 */
 32	T4 = ((1 << (Bit4 + 1)) - 1) ^ 0xFF, /* 1111 0000 */
 33	T5 = ((1 << (Bit5 + 1)) - 1) ^ 0xFF, /* 1111 1000 */
 34
 35	Rune1 = (1 << (Bit1 + 0 * Bitx)) - 1, /* 0000 0000 0000 0000 0111 1111 */
 36	Rune2 = (1 << (Bit2 + 1 * Bitx)) - 1, /* 0000 0000 0000 0111 1111 1111 */
 37	Rune3 = (1 << (Bit3 + 2 * Bitx)) - 1, /* 0000 0000 1111 1111 1111 1111 */
 38	Rune4 = (1 << (Bit4 + 3 * Bitx)) - 1, /* 0011 1111 1111 1111 1111 1111 */
 39
 40	Maskx = (1 << Bitx) - 1, /* 0011 1111 */
 41	Testx = Maskx ^ 0xFF,    /* 1100 0000 */
 42
 43	Bad = Runeerror
 44};
 45
 46int chartorune(Rune* rune, char* str) {
 47	int  c, c1, c2, c3;
 48	long l;
 49
 50	/*
 51	 * one character sequence
 52	 *	00000-0007F => T1
 53	 */
 54	c = *(uchar*) str;
 55	if (c < Tx) {
 56		*rune = c;
 57		return 1;
 58	}
 59
 60	/*
 61	 * two character sequence
 62	 *	0080-07FF => T2 Tx
 63	 */
 64	c1 = *(uchar*) (str + 1) ^ Tx;
 65	if (c1 & Testx)
 66		goto bad;
 67	if (c < T3) {
 68		if (c < T2)
 69			goto bad;
 70		l = ((c << Bitx) | c1) & Rune2;
 71		if (l <= Rune1)
 72			goto bad;
 73		*rune = l;
 74		return 2;
 75	}
 76
 77	/*
 78	 * three character sequence
 79	 *	0800-FFFF => T3 Tx Tx
 80	 */
 81	c2 = *(uchar*) (str + 2) ^ Tx;
 82	if (c2 & Testx)
 83		goto bad;
 84	if (c < T4) {
 85		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
 86		if (l <= Rune2)
 87			goto bad;
 88		*rune = l;
 89		return 3;
 90	}
 91
 92	/*
 93	 * four character sequence
 94	 *	10000-10FFFF => T4 Tx Tx Tx
 95	 */
 96	if (UTFmax >= 4) {
 97		c3 = *(uchar*) (str + 3) ^ Tx;
 98		if (c3 & Testx)
 99			goto bad;
100		if (c < T5) {
101			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
102			if (l <= Rune3)
103				goto bad;
104			if (l > Runemax)
105				goto bad;
106			*rune = l;
107			return 4;
108		}
109	}
110
111	/*
112	 * bad decoding
113	 */
114bad:
115	*rune = Bad;
116	return 1;
117}
118
119int runetochar(char* str, Rune* rune) {
120	long c;
121
122	/*
123	 * one character sequence
124	 *	00000-0007F => 00-7F
125	 */
126	c = *rune;
127	if (c <= Rune1) {
128		str[0] = c;
129		return 1;
130	}
131
132	/*
133	 * two character sequence
134	 *	00080-007FF => T2 Tx
135	 */
136	if (c <= Rune2) {
137		str[0] = T2 | (c >> 1 * Bitx);
138		str[1] = Tx | (c & Maskx);
139		return 2;
140	}
141
142	/*
143	 * three character sequence
144	 *	00800-0FFFF => T3 Tx Tx
145	 */
146	if (c > Runemax)
147		c = Runeerror;
148	if (c <= Rune3) {
149		str[0] = T3 | (c >> 2 * Bitx);
150		str[1] = Tx | ((c >> 1 * Bitx) & Maskx);
151		str[2] = Tx | (c & Maskx);
152		return 3;
153	}
154
155	/*
156	 * four character sequence
157	 *	010000-1FFFFF => T4 Tx Tx Tx
158	 */
159	str[0] = T4 | (c >> 3 * Bitx);
160	str[1] = Tx | ((c >> 2 * Bitx) & Maskx);
161	str[2] = Tx | ((c >> 1 * Bitx) & Maskx);
162	str[3] = Tx | (c & Maskx);
163	return 4;
164}
165
166int runelen(long c) {
167	Rune rune;
168	char str[10];
169
170	rune = c;
171	return runetochar(str, &rune);
172}
173
174int runenlen(Rune* r, int nrune) {
175	int nb, c;
176
177	nb = 0;
178	while (nrune--) {
179		c = *r++;
180		if (c <= Rune1)
181			nb++;
182		else if (c <= Rune2)
183			nb += 2;
184		else if (c <= Rune3 || c > Runemax)
185			nb += 3;
186		else
187			nb += 4;
188	}
189	return nb;
190}
191
192int fullrune(char* str, int n) {
193	int c;
194
195	if (n <= 0)
196		return 0;
197	c = *(uchar*) str;
198	if (c < Tx)
199		return 1;
200	if (c < T3)
201		return n >= 2;
202	if (UTFmax == 3 || c < T4)
203		return n >= 3;
204	return n >= 4;
205}