LMMS
Loading...
Searching...
No Matches
wdlutf8.h
Go to the documentation of this file.
1/*
2WDL - wdlutf8.h
3Copyright (C) 2005 and later, Cockos Incorporated
4
5This software is provided 'as-is', without any express or implied
6warranty. In no event will the authors be held liable for any damages
7arising from the use of this software.
8
9Permission is granted to anyone to use this software for any purpose,
10including commercial applications, and to alter it and redistribute it
11freely, subject to the following restrictions:
12
131. The origin of this software must not be misrepresented; you must not
14claim that you wrote the original software. If you use this software
15in a product, an acknowledgment in the product documentation would be
16appreciated but is not required.
172. Altered source versions must be plainly marked as such, and must not be
18misrepresented as being the original software.
193. This notice may not be removed or altered from any source distribution.
20
21*/
22
23#ifndef _WDLUTF8_H_
24#define _WDLUTF8_H_
25
26/* todo: handle overlongs?
27 * todo: handle multi-byte (make WideStr support UTF-16)
28 */
29
30#include "wdltypes.h"
31
// WDL_WCHAR is the wide-character type used by the string conversion functions
// in this header. It can be predefined before inclusion to override the default.
#if !defined(WDL_WCHAR)
  #if defined(_WIN32)
    #define WDL_WCHAR WCHAR
  #else
    // wchar_t is often 4 bytes on macOS/linux, so do not assume 16-bit units here
    #define WDL_WCHAR wchar_t
  #endif
#endif
40
41
// Decodes the UTF-8 sequence that starts at rd (rd must be nul terminated).
// Returns the number of bytes the sequence occupies and, if cOut is non-NULL,
// stores the decoded code point in *cOut.
// If the bytes at rd are not valid UTF-8 (stray continuation byte, truncated
// sequence, overlong encoding), returns 1 and sets *cOut to the first byte
// (as unsigned char), so callers can treat it as a single 8-bit character.
// Structurally valid 5 and 6 byte sequences are consumed whole and reported as '_'.
// Surrogates and 4-byte values above 0x10FFFF are not rejected.
// cOut may be NULL if you only want the size of the character.
static int WDL_STATICFUNC_UNUSED wdl_utf8_parsechar(const char *rd, int *cOut)
{
  const unsigned char *p = (const unsigned char *)rd;
  const unsigned char b0 = *p;
  unsigned char b1,b2,b3;

  if (cOut) *cOut = b0;

  // 0x00..0x7F is plain ASCII. 0x80..0xBF can only ever be a continuation byte,
  // and 0xC0/0xC1 can only begin an overlong 2-byte form, so no byte below 0xC2
  // is allowed to start a multi-byte sequence.
  if (b0 < 0xC2) return 1;

  // every check below stops at the terminating nul (0 is not a continuation
  // byte), so p[] is never read past the end of the string
  if (((b1=p[1])&0xC0) != 0x80) return 1;

  if (b0 < 0xE0)
  {
    if (cOut) *cOut = ((b0&0x1F)<<6)|(b1&0x3F);
    return 2;
  }

  if (((b2=p[2])&0xC0) != 0x80) return 1;

  if (b0 < 0xF0)
  {
    if (!(b0&0xF) && !(b1&0x20)) return 1; // detect overlong (E0 followed by 80..9F)

    if (cOut) *cOut = ((b0&0x0F)<<12)|((b1&0x3F)<<6)|(b2&0x3f);
    return 3;
  }

  if (((b3=p[3])&0xC0) != 0x80) return 1;

  if (b0 < 0xF8)
  {
    if (!(b0&0x7) && !(b1&0x30)) return 1; // detect overlong (F0 followed by 80..8F)

    if (cOut) *cOut = ((b0&7)<<18)|((b1&0x3F)<<12)|((b2&0x3F)<<6)|(b3&0x3F);
    return 4;
  }

  // UTF-8 does not actually support 5-6 byte sequences as of 2003 (RFC-3629)
  // skip them and return _
  if ((p[4]&0xC0) != 0x80) return 1;
  if (b0 < 0xFC)
  {
    if (cOut) *cOut = '_';
    return 5;
  }

  if ((p[5]&0xC0) != 0x80) return 1;
  if (cOut) *cOut = '_';
  return 6;
}
98
99
// Encodes the code point c as UTF-8 into dest, which has room for dest_len bytes.
// Does NOT nul terminate.
// Returns the number of bytes written (1 to 4), 0 if dest_len is too small for
// the encoding (nothing is written), or -1 if c is negative or >= 0x200000.
static int WDL_STATICFUNC_UNUSED wdl_utf8_makechar(int c, char *dest, int dest_len)
{
  // lead-byte prefix, indexed by encoded length
  static const unsigned char lead_prefix[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
  int nbytes, i;

  if (c < 0 || c >= 0x200000) return -1; // out of range character

  nbytes = c < 0x80 ? 1 : c < 0x800 ? 2 : c < 0x10000 ? 3 : 4;
  if (dest_len < nbytes) return 0;

  // fill continuation bytes from the end, consuming 6 bits of c each time
  for (i = nbytes-1; i > 0; i--)
  {
    dest[i] = (char)(0x80|(c&0x3F));
    c >>= 6;
  }
  // whatever bits remain belong in the lead byte
  dest[0] = (char)(lead_prefix[nbytes]|c);

  return nbytes;
}
141
142
143// invalid UTF-8 are now treated as ANSI characters for this function
144static int WDL_STATICFUNC_UNUSED WDL_MBtoWideStr(WDL_WCHAR *dest, const char *src, int destlenbytes)
145{
146 WDL_WCHAR *w = dest, *dest_endp = dest+(size_t)destlenbytes/sizeof(WDL_WCHAR)-1;
147 if (!dest || destlenbytes < 1) return 0;
148
149 if (src) for (; *src && w < dest_endp; )
150 {
151 int c,sz=wdl_utf8_parsechar(src,&c);
152 *w++ = c;
153 src+=sz;
154 }
155 *w=0;
156 return (int)(w-dest);
157}
158
159
160// like wdl_utf8_makechar, except nul terminates and handles errors differently (returns _ and 1 on errors)
161// negative values for character are treated as 0.
162static int WDL_STATICFUNC_UNUSED WDL_MakeUTFChar(char* dest, int c, int destlen)
163{
164 if (destlen < 2)
165 {
166 if (destlen == 1) dest[0]=0;
167 return 0;
168 }
169 else
170 {
171 const int v = wdl_utf8_makechar(c>0?c:0,dest,destlen-1);
172 if (v < 1) // implies either insufficient space or out of range character
173 {
174 dest[0]='_';
175 dest[1]=0;
176 return 1;
177 }
178 dest[v]=0;
179 return v;
180 }
181}
182
183static int WDL_STATICFUNC_UNUSED WDL_WideToMBStr(char *dest, const WDL_WCHAR *src, int destlenbytes)
184{
185 char *p = dest, *dest_endp = dest + destlenbytes - 1;
186 if (!dest || destlenbytes < 1) return 0;
187
188 if (src) while (*src && p < dest_endp)
189 {
190 const int v = wdl_utf8_makechar(*src++,p,(int)(dest_endp-p));
191 if (v > 0)
192 {
193 p += v;
194 }
195 else if (v == 0) break; // out of space
196 }
197 *p=0;
198 return (int)(p-dest);
199}
200
201// returns >0 if UTF-8, -1 if 8-bit chars occur that are not UTF-8, or 0 if ASCII
202static int WDL_STATICFUNC_UNUSED WDL_DetectUTF8(const char *str)
203{
204 int hasUTF=0;
205
206 if (!str) return 0;
207
208 for (;;)
209 {
210 const unsigned char c = *(const unsigned char *)str;
211
212 if (c < 0xC2 || c > 0xF7)
213 {
214 if (!c) return hasUTF;
215 if (c >= 0x80) return -1;
216 str++;
217 }
218 else
219 {
220 const int l = wdl_utf8_parsechar(str,NULL);
221 if (l < 2) return -1; // wdl_utf8_parsechar returns length=1 if it couldn't parse UTF-8 properly
222 str+=l;
223 hasUTF=1;
224 }
225 }
226}
227
228
229static int WDL_STATICFUNC_UNUSED WDL_utf8_charpos_to_bytepos(const char *str, int charpos)
230{
231 int bpos = 0;
232 while (charpos-- > 0 && str[bpos])
233 {
234 bpos += wdl_utf8_parsechar(str+bpos,NULL);
235 }
236 return bpos;
237}
238static int WDL_STATICFUNC_UNUSED WDL_utf8_bytepos_to_charpos(const char *str, int bytepos)
239{
240 int bpos = 0, cpos=0;
241 while (bpos < bytepos && str[bpos])
242 {
243 bpos += wdl_utf8_parsechar(str+bpos,NULL);
244 cpos++;
245 }
246 return cpos;
247}
248
249#define WDL_utf8_get_charlen(rd) WDL_utf8_bytepos_to_charpos((rd), 0x7fffffff)
250
251#endif
#define NULL
Definition CarlaBridgeFormat.cpp:30
UINT_D64 w
Definition inflate.c:942
int * l
Definition inflate.c:1579
unsigned v[N_MAX]
Definition inflate.c:1584
uch * p
Definition crypt.c:594
return c
Definition crypt.c:175
#define WDL_STATICFUNC_UNUSED
Definition wdltypes.h:87
#define WDL_WCHAR
Definition wdlutf8.h:37
static int WDL_STATICFUNC_UNUSED WDL_utf8_charpos_to_bytepos(const char *str, int charpos)
Definition wdlutf8.h:229
static int WDL_STATICFUNC_UNUSED WDL_MakeUTFChar(char *dest, int c, int destlen)
Definition wdlutf8.h:162
static int WDL_STATICFUNC_UNUSED wdl_utf8_parsechar(const char *rd, int *cOut)
Definition wdlutf8.h:45
static int WDL_STATICFUNC_UNUSED WDL_DetectUTF8(const char *str)
Definition wdlutf8.h:202
static int WDL_STATICFUNC_UNUSED WDL_WideToMBStr(char *dest, const WDL_WCHAR *src, int destlenbytes)
Definition wdlutf8.h:183
static int WDL_STATICFUNC_UNUSED WDL_MBtoWideStr(WDL_WCHAR *dest, const char *src, int destlenbytes)
Definition wdlutf8.h:144
static int WDL_STATICFUNC_UNUSED WDL_utf8_bytepos_to_charpos(const char *str, int bytepos)
Definition wdlutf8.h:238
static int WDL_STATICFUNC_UNUSED wdl_utf8_makechar(int c, char *dest, int dest_len)
Definition wdlutf8.h:102