通过 C 中的管道读取表情符号
Reading Emojis through a pipe in C
我有一个管道,其中写入了无穷无尽的字符串。这些字符串是 ASCII 和表情符号的混合体。我遇到的问题是我正在这样阅读它们
// Snippet from the question: read up to 99 bytes from fd into msg and NUL-terminate it.
char msg[100];
// read() counts bytes, not characters, so a multi-byte UTF-8 character can be
// split between two consecutive calls.
int length = read(fd,&msg,99);
// NOTE(review): read() can return -1 on error, which would make this index
// out of bounds - the return value should be checked before use.
msg[length] =0;
但我猜测,由于表情符号是多字节的,有时它会被从中间截断;这样在打印到屏幕时,就会出现带问号的菱形符号(即无法识别的 UTF-8 符号)。
如果有人知道如何防止这种情况,请告诉我;我已经找了一段时间了。
如果您正在读取字节块,并希望输出 UTF-8 块,您必须自己至少进行一些最小的 UTF-8 解码。要检查的最简单条件是查看每个字节(我们称它为 b)并查看它是否是连续字节:
// b is a UTF-8 continuation byte when its top two bits are 10 (values 0x80..0xBF).
bool is_cont = (0x80 == (0xC0 & b));
任何不是延续的字节都开始一个序列,该序列一直持续到下一个非延续字节。您需要一个 4 字节的缓冲区来保存块。
下面的示例代码使用标准输入,但您可以取消注释 fdopen(fd, "r");
以改用 fd 管道。
这是一个超级简单的示例,说明如何执行此操作。它可能会慢一点,但我会先尝试一下,看看它是否满足您的需求。您还可以使用 fgetws() 读取更大的块。
下面的程序将读取 UTF8 字符并正确打印出来。
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <locale.h>
/*
 * Echoes the input stream to the output stream one wide character at a
 * time, framed by "Program started" / "Program ended" banners.
 *
 * Error handling for setlocale() and for the output calls is intentionally
 * omitted in this example.
 */
int main(void)
{
    FILE *in = stdin; //or fdopen(fd, "r");
    FILE *out = stdout;

    setlocale(LC_ALL, "en_US.utf8");

    /* Wide-oriented output: note the L prefix on the string literals. */
    fputws(L"Program started\n", out);

    for (;;) {
        /* On a terminal, CTRL+D makes fgetwc() report WEOF. */
        wint_t ch = fgetwc(in);
        if (ch == WEOF) {
            break;
        }
        fputwc(ch, out);
    }

    fputws(L"Program ended\n", out);
    return EXIT_SUCCESS;
}
也可以与管道一起使用:
$ echo "hello. кошкâ" | ./a.out
Program started
hello. кошкâ
Program ended
lee-daniel-crocker 提供的提示可以很好地检查某个特定字节是否是 utf-8/utf-16 多字节序列的一部分。
除此之外,您还需要添加更多逻辑。当您在流的末尾找到 utf-8 的部分序列时,您需要在流中回头查看(这里是缓冲区)以定位此部分序列的开始位置。
一旦找到此部分 utf-8 代码序列的开始位置,就把这部分字节保存下来,将其从缓冲区中删除,然后处理缓冲区中剩余的内容。在下一个读取周期中,把保存的部分序列放到缓冲区的开头。这样,即使一个 utf-8 代码序列被拆分到两次 read() 操作中,也能得到正确处理。
下面是用于测试和验证的示例代码。
App.c
// gcc -Wall app.c
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
/* Run flag polled by the worker loops; starts at 1 and is cleared by the handler. */
volatile sig_atomic_t g_process_run = 1;

/* Signal handler: clears the run flag; the signal number itself is not used. */
void signal_handler(int signal)
{
  (void)signal;
  g_process_run = 0;
}
/*
 * Writer side of the demo: maps "a.txt" and, once per second, writes a
 * pseudo-randomly sized chunk of it to the write end of the pipe, wrapping
 * back to the start of the file when the next chunk would run past the end.
 * Chunk boundaries deliberately ignore UTF-8 character boundaries.
 *
 * pipe: two-element descriptor array as filled in by pipe(); the read end
 *       is closed here, the write end is used and closed on exit.
 * Returns 0 on normal shutdown (g_process_run cleared or write failure),
 * -1 if the input file cannot be opened, stat'ed or mapped.
 */
int child_process(int *pipe) {
  close(pipe[0]); // close read pipe
  srand(1234);
  int chars_to_send[] = {95, 97, 99, 100, 101, 103, 104, 105,
                         95, 97, 99, 100, 101, 103, 104, 105};
  const long long chunk_count =
      (long long)(sizeof chars_to_send / sizeof chars_to_send[0]);
  // int chars_to_send[] = {6, 7, 8, 9,12,14,15,16};
  int fd = open("a.txt", O_RDONLY);
  if (fd == -1) {
    printf("Child: can't open file\n");
    return -1;
  }
  struct stat sb;
  if (fstat(fd, &sb) == -1) {
    printf("Child: can't get file stat\n");
    close(fd);
    return -1;
  }
  off_t file_size = sb.st_size;
  char *addr = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
  if (addr == MAP_FAILED) {
    printf("Child:mmap failed");
    close(fd);
    return -1;
  }
  off_t start_address = 0;
  while (g_process_run != 0) {
    // Scale rand() into [0, chunk_count). Dividing by RAND_MAX + 1 (in a
    // wide type) keeps the index in range even when rand() == RAND_MAX.
    int index =
        (int)(((long long)rand() * chunk_count) / ((long long)RAND_MAX + 1));
    int len = chars_to_send[index];
    if ((off_t)len > file_size) {
      len = (int)file_size; // never read past the end of the mapping
    }
    if (start_address + len > file_size) {
      start_address = 0;
    }
    ssize_t written = write(pipe[1], &addr[start_address], (size_t)len);
    if (written < 0) {
      printf("Child: write failed\n");
      break;
    }
    // write() may be short; resume from where it actually stopped.
    start_address = start_address + written;
    sleep(1);
  }
  munmap(addr, file_size);
  close(fd);
  close(pipe[1]);
  printf("child process exiting\n");
  return 0;
}
/*
 * Returns how many bytes at the end of buf[0..len) belong to a UTF-8
 * sequence whose lead byte announces more bytes than are present, or 0 if
 * the buffer ends on a complete character (or on malformed data, which is
 * passed through unchanged).
 */
static int utf8_incomplete_tail(const char *buf, int len) {
  int seen = 0;
  // A sequence is at most 4 bytes long, so looking back 4 bytes is enough.
  for (int i = len - 1; i >= 0 && seen < 4; --i) {
    unsigned char b = (unsigned char)buf[i];
    ++seen;
    if ((b & 0xC0) == 0x80) {
      continue; // continuation byte, keep looking for the lead byte
    }
    int need = 1; // ASCII or invalid lead byte
    if ((b & 0xE0) == 0xC0) {
      need = 2;
    } else if ((b & 0xF0) == 0xE0) {
      need = 3;
    } else if ((b & 0xF8) == 0xF0) {
      need = 4;
    }
    return (need > seen) ? seen : 0;
  }
  return 0;
}

/*
 * Reader side of the demo: reads chunks from the pipe and prints each one,
 * holding back a trailing incomplete UTF-8 sequence so that it is printed
 * together with the bytes that complete it on the next read.
 *
 * pipe: two-element descriptor array as filled in by pipe(); the write end
 *       is closed here, the read end is used and closed on exit.
 * Returns 0 after the loop ends (run flag cleared, end of file or read
 * error) and the child has been waited for.
 */
int parent_process(int *pipe) {
  close(pipe[1]); // close write pipe
  enum { BUFF_SIZE = 99 };
  char buff[BUFF_SIZE + 1];
  int continueCount = 0; // bytes carried over from the previous read
  while (g_process_run != 0) {
    // Append after the carried-over partial sequence.
    ssize_t got = read(pipe[0], &buff[continueCount], BUFF_SIZE - continueCount);
    if (got <= 0) {
      break; // end of file, read error or interrupted by a signal
    }
    int len = continueCount + (int)got;
    continueCount = utf8_incomplete_tail(buff, len);
    int printable = len - continueCount;
    printf("Parent:%.*s\n", printable, buff);
    if (continueCount > 0) { // put partial utf-8 sequence to start of buffer,
                             // so it will prepend in next read cycle.
      printf("will resume with %d partial bytes\n", continueCount);
      memmove(buff, &buff[printable], (size_t)continueCount);
    }
  }
  close(pipe[0]);
  wait(NULL);
  printf("parent process exiting\n");
  return 0;
}
/* Registers signal_handler for SIGINT. Returns 0 on success, -1 if signal() reports SIG_ERR. */
int init_signal()
{
  return (signal(SIGINT, signal_handler) == SIG_ERR) ? -1 : 0;
}
/*
 * Entry point: installs the SIGINT handler, creates a pipe and forks.
 * The child runs child_process() and the parent runs parent_process();
 * the exit status is whatever the chosen side returns, or -1 if setup fails.
 */
int main(int argc, char **argv)
{
  int pipefd[2];
  pid_t pid;

  if (init_signal() != 0) {
    return -1;
  }

  if (pipe(pipefd) == -1) {
    printf("can't create pipe\n");
    return -1;
  }

  pid = fork();
  switch (pid) {
  case -1:
    printf("Can't fork process\n");
    return -1;
  case 0: // child process
    return child_process(pipefd);
  default:
    return parent_process(pipefd);
  }
}
a.txt
12abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️3
您可以找到此代码和测试文件 here。
我会选择类似的东西:
#include <stdio.h>
#include <unistd.h>
#define BUFFER_LENGTH 53
/* Prints message to stdout followed by a carriage return (0x0d). */
void print_function(char* message)
{
    char *text = message;

    printf("%s\r", text);
}
/*
 * Reads from the descriptor until end of file and hands the data to
 * print_func in NUL-terminated pieces that end on a UTF-8 character
 * boundary.
 *
 * Everything from the last non-continuation byte onwards is held back,
 * because more continuation bytes for that character may still arrive;
 * the held-back tail is prepended to the next read. Input is assumed to be
 * valid UTF-8.
 *
 * pipe:       descriptor to read from.
 * print_func: callback receiving each NUL-terminated piece.
 */
void read_pipe(int pipe, void (*print_func)(char*))
{
    char message[BUFFER_LENGTH];
    char to_print[1 + BUFFER_LENGTH];
    int pending = 0; // bytes carried over from the previous read

    for (;;)
    {
        int bytes_read = (int)read(pipe, message + pending, BUFFER_LENGTH - pending);
        if (bytes_read <= 0)
        {
            // end of file or read error: print remaining bytes and stop
            // (pending is always < BUFFER_LENGTH, so this stays in bounds)
            message[pending] = '\0';
            print_func(message);
            break;
        }
        int total = pending + bytes_read;

        // index of the last byte that starts a character (0 if there is none)
        int split = total;
        while (split > 0)
        {
            --split;
            if (0x80 != (0xc0 & message[split]))
            {
                break;
            }
        }
        // A full buffer without any later character start cannot grow any
        // further; flush it as is, otherwise the next read would be asked
        // for zero bytes and be mistaken for end of file.
        if (0 == split && BUFFER_LENGTH == total)
        {
            split = total;
        }

        // copy complete characters to to_print, terminate and print them
        for (int i = 0; i != split; ++i)
        {
            to_print[i] = message[i];
        }
        to_print[split] = '\0';
        print_func(to_print);

        // move the tail to the beginning of the input buffer
        pending = total - split;
        for (int i = 0; i != pending; ++i)
        {
            message[i] = message[split + i];
        }
    }
}
/* Entry point: streams standard input through read_pipe(), printing with print_function. */
int main()
{
    const int input_fd = STDIN_FILENO;

    read_pipe(input_fd, print_function);
    return 0;
}
此处 read_pipe 从传递的 pipe
描述符中无限读取,并使用传递的 print_func
函数打印数据。
想法是从管道读取缓冲区,然后仅复制到打印缓冲区的完整字符(条件由 Lee Daniel Crocker 提供),假设存在有效的 UTF-8 序列。如果缓冲区有一些不完整的 UTF-8 字符的尾部,它将被用作下一部分数据的开头。所以我们循环直到管道结束。
为简单起见,我使用 stdin 作为管道描述符。运行并测试:
gcc -Wall main.c -o run && perl -e 'print "\xf0\x9f\x98\xab"x1000;' > test.txt && ./run < test.txt > output.txt
附注:另一种方法是根据首字节获取字符长度,如《UTF-8 Continuation bytes》中所述:
#include <stdio.h>
#include <unistd.h>
#define BUFFER_LENGTH 53
/* Prints message to stdout followed by a newline. */
void print_function(char* message)
{
    char *text = message;

    printf("%s\n", text);
}
/*
 * Reads from the descriptor until end of file and hands the data to
 * print_func in NUL-terminated pieces that contain only complete UTF-8
 * characters.
 *
 * The length of each character is taken from the number of leading 1 bits
 * of its lead byte. A character whose announced length runs past the data
 * read so far is held back and prepended to the next read. Input is
 * assumed to be valid UTF-8; bytes that cannot start a sequence are passed
 * through one at a time.
 *
 * pipe:       descriptor to read from.
 * print_func: callback receiving each NUL-terminated piece.
 */
void read_pipe(int pipe, void (*print_func)(char*))
{
    char message[BUFFER_LENGTH];
    char to_print[1 + BUFFER_LENGTH];
    int pending = 0; // bytes carried over from the previous read (at most 3)

    for (;;)
    {
        int bytes_read = (int)read(pipe, message + pending, BUFFER_LENGTH - pending);
        if (bytes_read <= 0)
        {
            // end of file or read error: print remaining bytes and stop
            message[pending] = '\0';
            print_func(message);
            break;
        }
        int total = pending + bytes_read;

        // copy complete characters to buffer to_print
        int in_pos = 0;
        int out_pos = 0;
        while (in_pos != total)
        {
            unsigned char c = (unsigned char)message[in_pos];
            int length = 1;
            if (0xc0 == (0xc0 & c))
            {
                // count the leading 1 bits of the lead byte
                length = 0;
                while (0 != (0x80 & c))
                {
                    c <<= 1;
                    ++length;
                }
                if (length > 4)
                {
                    // 0xF8..0xFF never start a UTF-8 sequence
                    length = 1;
                }
                else if (in_pos + length > total)
                {
                    break; // rest of this character has not arrived yet
                }
            }
            for (int i = 0; i != length; ++i)
            {
                to_print[out_pos++] = message[in_pos++];
            }
        }

        // finish buffer with complete characters and print it
        to_print[out_pos] = '\0';
        print_func(to_print);

        // move tail to the beginning of the input buffer
        pending = total - in_pos;
        for (int i = 0; i != pending; ++i)
        {
            message[i] = message[in_pos + i];
        }
    }
}
/* Entry point: streams standard input through read_pipe(), printing with print_function. */
int main()
{
    const int input_fd = STDIN_FILENO;

    read_pipe(input_fd, print_function);
    return 0;
}
/*
 * Counts the bytes of the NUL-terminated string s that are not UTF-8
 * continuation bytes (10xxxxxx); for valid UTF-8 this is the number of
 * encoded characters.
 * Approach from: http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
 */
int strlen_utf8(const char* s)
{
    int count = 0;

    for (const char *p = s; *p != '\0'; ++p)
    {
        if ((*p & 0xc0) != 0x80)
        {
            ++count;
        }
    }
    return count;
}
/*
 * Decodes the NUL-terminated UTF-8 string s into ws, one wchar_t per
 * encoded character, and terminates ws with L'\0'.
 *
 *   0xxxxxxx                             1 byte
 *   110xxxxx 10xxxxxx                    2 bytes
 *   1110xxxx 10xxxxxx 10xxxxxx           3 bytes
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4 bytes
 *
 * ws: output buffer; exactly one element is written per non-continuation
 *     byte of s, plus the terminator, so strlen_utf8(s) + 1 elements
 *     suffice.
 * s:  input string.
 *
 * Malformed input never causes a read past the terminator: a sequence that
 * is cut short or starts with an invalid lead byte yields U+FFFD, and
 * stray continuation bytes are skipped. Code points that do not fit in
 * wchar_t (4-byte sequences when wchar_t is 16 bits wide) are truncated by
 * the final conversion.
 */
void utf8_to_wchar_t(wchar_t * ws,const char* s)
{
    const unsigned char *p = (const unsigned char *)s;
    size_t out = 0;

    while (*p != 0)
    {
        unsigned char lead = *p++;
        unsigned long cp;
        int extra; // continuation bytes still expected

        if (lead < 0x80)                    // 0xxxxxxx
        {
            cp = lead;
            extra = 0;
        }
        else if ((lead & 0xC0) == 0x80)     // stray continuation byte
        {
            continue;
        }
        else if ((lead & 0xE0) == 0xC0)     // 110xxxxx
        {
            cp = lead & 0x1Fu;
            extra = 1;
        }
        else if ((lead & 0xF0) == 0xE0)     // 1110xxxx
        {
            cp = lead & 0x0Fu;
            extra = 2;
        }
        else if ((lead & 0xF8) == 0xF0)     // 11110xxx
        {
            cp = lead & 0x07u;
            extra = 3;
        }
        else                                // 0xF8..0xFF: not a valid lead
        {
            cp = 0xFFFDu;
            extra = 0;
        }

        for (; extra > 0; --extra)
        {
            if ((*p & 0xC0) != 0x80)
            {
                // sequence cut short (possibly by the terminator)
                cp = 0xFFFDu;
                break;
            }
            cp = (cp << 6) | (unsigned long)(*p++ & 0x3Fu);
        }

        ws[out++] = (wchar_t)cp;
    }
    ws[out] = L'\0';
}
/*
 * Manual check for utf8_to_wchar_t(): converts a two-character UTF-8
 * sample, then writes the raw UTF-8 bytes to "a.txt" and the converted
 * text to "a2.txt" as UTF-16 little endian with a byte order mark.
 *
 * Each code unit is serialised explicitly as two bytes, so the output does
 * not depend on the size or byte order of wchar_t on the host. Only the
 * low 16 bits of each element are written; the sample contains no
 * characters outside that range. A file that cannot be opened is skipped.
 */
void test()
{
    const char s[] = "\xc5\x9f\xe2\x98\xba";//test utf8
    wchar_t ws[100];
    utf8_to_wchar_t(ws, s);

    //write 8bit
    FILE* fp = fopen("a.txt", "wb");
    if (fp != NULL)
    {
        fwrite(s, 1, sizeof s - 1, fp);
        fclose(fp);
    }

    //write 16bit
    FILE* fp2 = fopen("a2.txt", "wb");
    if (fp2 != NULL)
    {
        fwrite("\xff\xfe", 1, 2, fp2); //little endian
        for (int k = 0; ws[k] != L'\0'; ++k)
        {
            unsigned long unit = (unsigned long)ws[k];
            unsigned char bytes[2];
            bytes[0] = (unsigned char)(unit & 0xFFu);
            bytes[1] = (unsigned char)((unit >> 8) & 0xFFu);
            fwrite(bytes, 1, 2, fp2);
        }
        fclose(fp2);
    }
}
我有一个管道,其中写入了无穷无尽的字符串。这些字符串是 ASCII 和表情符号的混合体。我遇到的问题是我正在这样阅读它们
// Snippet from the question: read up to 99 bytes from fd into msg and NUL-terminate it.
char msg[100];
// read() counts bytes, not characters, so a multi-byte UTF-8 character can be
// split between two consecutive calls.
int length = read(fd,&msg,99);
// NOTE(review): read() can return -1 on error, which would make this index
// out of bounds - the return value should be checked before use.
msg[length] =0;
但我猜测,由于表情符号是多字节的,有时它会被从中间截断;这样在打印到屏幕时,就会出现带问号的菱形符号(即无法识别的 UTF-8 符号)。
如果有人知道如何防止这种情况,请告诉我;我已经找了一段时间了。
如果您正在读取字节块,并希望输出 UTF-8 块,您必须自己至少进行一些最小的 UTF-8 解码。要检查的最简单条件是查看每个字节(我们称它为 b)并查看它是否是连续字节:
// b is a UTF-8 continuation byte when its top two bits are 10 (values 0x80..0xBF).
bool is_cont = (0x80 == (0xC0 & b));
任何不是延续的字节都开始一个序列,该序列一直持续到下一个非延续字节。您需要一个 4 字节的缓冲区来保存块。
下面的示例代码使用标准输入,但您可以取消注释 fdopen(fd, "r");
以改用 fd 管道。
这是一个超级简单的示例,说明如何执行此操作。它可能会慢一点,但我会先尝试一下,看看它是否满足您的需求。您还可以使用 fgetws() 读取更大的块。
下面的程序将读取 UTF8 字符并正确打印出来。
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <locale.h>
/*
 * Echoes the input stream to the output stream one wide character at a
 * time, framed by "Program started" / "Program ended" banners.
 *
 * Error handling for setlocale() and for the output calls is intentionally
 * omitted in this example.
 */
int main(void)
{
    FILE *in = stdin; //or fdopen(fd, "r");
    FILE *out = stdout;

    setlocale(LC_ALL, "en_US.utf8");

    /* Wide-oriented output: note the L prefix on the string literals. */
    fputws(L"Program started\n", out);

    for (;;) {
        /* On a terminal, CTRL+D makes fgetwc() report WEOF. */
        wint_t ch = fgetwc(in);
        if (ch == WEOF) {
            break;
        }
        fputwc(ch, out);
    }

    fputws(L"Program ended\n", out);
    return EXIT_SUCCESS;
}
也可以与管道一起使用:
$ echo "hello. кошкâ" | ./a.out
Program started
hello. кошкâ
Program ended
lee-daniel-crocker 提供的提示可以很好地检查某个特定字节是否是 utf-8/utf-16 多字节序列的一部分。
除此之外,您还需要添加更多逻辑。当您在流的末尾找到 utf-8 的部分序列时,您需要在流中回头查看(这里是缓冲区)以定位此部分序列的开始位置。
一旦找到此部分 utf-8 代码序列的开始位置,就把这部分字节保存下来,将其从缓冲区中删除,然后处理缓冲区中剩余的内容。在下一个读取周期中,把保存的部分序列放到缓冲区的开头。这样,即使一个 utf-8 代码序列被拆分到两次 read() 操作中,也能得到正确处理。
下面是用于测试和验证的示例代码。
App.c
// gcc -Wall app.c
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
/* Run flag polled by the worker loops; starts at 1 and is cleared by the handler. */
volatile sig_atomic_t g_process_run = 1;

/* Signal handler: clears the run flag; the signal number itself is not used. */
void signal_handler(int signal)
{
  (void)signal;
  g_process_run = 0;
}
/*
 * Writer side of the demo: maps "a.txt" and, once per second, writes a
 * pseudo-randomly sized chunk of it to the write end of the pipe, wrapping
 * back to the start of the file when the next chunk would run past the end.
 * Chunk boundaries deliberately ignore UTF-8 character boundaries.
 *
 * pipe: two-element descriptor array as filled in by pipe(); the read end
 *       is closed here, the write end is used and closed on exit.
 * Returns 0 on normal shutdown (g_process_run cleared or write failure),
 * -1 if the input file cannot be opened, stat'ed or mapped.
 */
int child_process(int *pipe) {
  close(pipe[0]); // close read pipe
  srand(1234);
  int chars_to_send[] = {95, 97, 99, 100, 101, 103, 104, 105,
                         95, 97, 99, 100, 101, 103, 104, 105};
  const long long chunk_count =
      (long long)(sizeof chars_to_send / sizeof chars_to_send[0]);
  // int chars_to_send[] = {6, 7, 8, 9,12,14,15,16};
  int fd = open("a.txt", O_RDONLY);
  if (fd == -1) {
    printf("Child: can't open file\n");
    return -1;
  }
  struct stat sb;
  if (fstat(fd, &sb) == -1) {
    printf("Child: can't get file stat\n");
    close(fd);
    return -1;
  }
  off_t file_size = sb.st_size;
  char *addr = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
  if (addr == MAP_FAILED) {
    printf("Child:mmap failed");
    close(fd);
    return -1;
  }
  off_t start_address = 0;
  while (g_process_run != 0) {
    // Scale rand() into [0, chunk_count). Dividing by RAND_MAX + 1 (in a
    // wide type) keeps the index in range even when rand() == RAND_MAX.
    int index =
        (int)(((long long)rand() * chunk_count) / ((long long)RAND_MAX + 1));
    int len = chars_to_send[index];
    if ((off_t)len > file_size) {
      len = (int)file_size; // never read past the end of the mapping
    }
    if (start_address + len > file_size) {
      start_address = 0;
    }
    ssize_t written = write(pipe[1], &addr[start_address], (size_t)len);
    if (written < 0) {
      printf("Child: write failed\n");
      break;
    }
    // write() may be short; resume from where it actually stopped.
    start_address = start_address + written;
    sleep(1);
  }
  munmap(addr, file_size);
  close(fd);
  close(pipe[1]);
  printf("child process exiting\n");
  return 0;
}
/*
 * Returns how many bytes at the end of buf[0..len) belong to a UTF-8
 * sequence whose lead byte announces more bytes than are present, or 0 if
 * the buffer ends on a complete character (or on malformed data, which is
 * passed through unchanged).
 */
static int utf8_incomplete_tail(const char *buf, int len) {
  int seen = 0;
  // A sequence is at most 4 bytes long, so looking back 4 bytes is enough.
  for (int i = len - 1; i >= 0 && seen < 4; --i) {
    unsigned char b = (unsigned char)buf[i];
    ++seen;
    if ((b & 0xC0) == 0x80) {
      continue; // continuation byte, keep looking for the lead byte
    }
    int need = 1; // ASCII or invalid lead byte
    if ((b & 0xE0) == 0xC0) {
      need = 2;
    } else if ((b & 0xF0) == 0xE0) {
      need = 3;
    } else if ((b & 0xF8) == 0xF0) {
      need = 4;
    }
    return (need > seen) ? seen : 0;
  }
  return 0;
}

/*
 * Reader side of the demo: reads chunks from the pipe and prints each one,
 * holding back a trailing incomplete UTF-8 sequence so that it is printed
 * together with the bytes that complete it on the next read.
 *
 * pipe: two-element descriptor array as filled in by pipe(); the write end
 *       is closed here, the read end is used and closed on exit.
 * Returns 0 after the loop ends (run flag cleared, end of file or read
 * error) and the child has been waited for.
 */
int parent_process(int *pipe) {
  close(pipe[1]); // close write pipe
  enum { BUFF_SIZE = 99 };
  char buff[BUFF_SIZE + 1];
  int continueCount = 0; // bytes carried over from the previous read
  while (g_process_run != 0) {
    // Append after the carried-over partial sequence.
    ssize_t got = read(pipe[0], &buff[continueCount], BUFF_SIZE - continueCount);
    if (got <= 0) {
      break; // end of file, read error or interrupted by a signal
    }
    int len = continueCount + (int)got;
    continueCount = utf8_incomplete_tail(buff, len);
    int printable = len - continueCount;
    printf("Parent:%.*s\n", printable, buff);
    if (continueCount > 0) { // put partial utf-8 sequence to start of buffer,
                             // so it will prepend in next read cycle.
      printf("will resume with %d partial bytes\n", continueCount);
      memmove(buff, &buff[printable], (size_t)continueCount);
    }
  }
  close(pipe[0]);
  wait(NULL);
  printf("parent process exiting\n");
  return 0;
}
/* Registers signal_handler for SIGINT. Returns 0 on success, -1 if signal() reports SIG_ERR. */
int init_signal()
{
  return (signal(SIGINT, signal_handler) == SIG_ERR) ? -1 : 0;
}
/*
 * Entry point: installs the SIGINT handler, creates a pipe and forks.
 * The child runs child_process() and the parent runs parent_process();
 * the exit status is whatever the chosen side returns, or -1 if setup fails.
 */
int main(int argc, char **argv)
{
  int pipefd[2];
  pid_t pid;

  if (init_signal() != 0) {
    return -1;
  }

  if (pipe(pipefd) == -1) {
    printf("can't create pipe\n");
    return -1;
  }

  pid = fork();
  switch (pid) {
  case -1:
    printf("Can't fork process\n");
    return -1;
  case 0: // child process
    return child_process(pipefd);
  default:
    return parent_process(pipefd);
  }
}
a.txt
12abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️312abc️a23️s345️24ee️dai️iodqs️dqk️pdoo9️93wd️qd3️2om️de9️3
您可以找到此代码和测试文件 here。
我会选择类似的东西:
#include <stdio.h>
#include <unistd.h>
#define BUFFER_LENGTH 53
/* Prints message to stdout followed by a carriage return (0x0d). */
void print_function(char* message)
{
    char *text = message;

    printf("%s\r", text);
}
/*
 * Reads from the descriptor until end of file and hands the data to
 * print_func in NUL-terminated pieces that end on a UTF-8 character
 * boundary.
 *
 * Everything from the last non-continuation byte onwards is held back,
 * because more continuation bytes for that character may still arrive;
 * the held-back tail is prepended to the next read. Input is assumed to be
 * valid UTF-8.
 *
 * pipe:       descriptor to read from.
 * print_func: callback receiving each NUL-terminated piece.
 */
void read_pipe(int pipe, void (*print_func)(char*))
{
    char message[BUFFER_LENGTH];
    char to_print[1 + BUFFER_LENGTH];
    int pending = 0; // bytes carried over from the previous read

    for (;;)
    {
        int bytes_read = (int)read(pipe, message + pending, BUFFER_LENGTH - pending);
        if (bytes_read <= 0)
        {
            // end of file or read error: print remaining bytes and stop
            // (pending is always < BUFFER_LENGTH, so this stays in bounds)
            message[pending] = '\0';
            print_func(message);
            break;
        }
        int total = pending + bytes_read;

        // index of the last byte that starts a character (0 if there is none)
        int split = total;
        while (split > 0)
        {
            --split;
            if (0x80 != (0xc0 & message[split]))
            {
                break;
            }
        }
        // A full buffer without any later character start cannot grow any
        // further; flush it as is, otherwise the next read would be asked
        // for zero bytes and be mistaken for end of file.
        if (0 == split && BUFFER_LENGTH == total)
        {
            split = total;
        }

        // copy complete characters to to_print, terminate and print them
        for (int i = 0; i != split; ++i)
        {
            to_print[i] = message[i];
        }
        to_print[split] = '\0';
        print_func(to_print);

        // move the tail to the beginning of the input buffer
        pending = total - split;
        for (int i = 0; i != pending; ++i)
        {
            message[i] = message[split + i];
        }
    }
}
/* Entry point: streams standard input through read_pipe(), printing with print_function. */
int main()
{
    const int input_fd = STDIN_FILENO;

    read_pipe(input_fd, print_function);
    return 0;
}
此处 read_pipe 从传递的 pipe
描述符中无限读取,并使用传递的 print_func
函数打印数据。
想法是从管道读取缓冲区,然后仅复制到打印缓冲区的完整字符(条件由 Lee Daniel Crocker 提供),假设存在有效的 UTF-8 序列。如果缓冲区有一些不完整的 UTF-8 字符的尾部,它将被用作下一部分数据的开头。所以我们循环直到管道结束。
为简单起见,我使用 stdin 作为管道描述符。运行并测试:
gcc -Wall main.c -o run && perl -e 'print "\xf0\x9f\x98\xab"x1000;' > test.txt && ./run < test.txt > output.txt
附注:另一种方法是根据首字节获取字符长度,如《UTF-8 Continuation bytes》中所述:
#include <stdio.h>
#include <unistd.h>
#define BUFFER_LENGTH 53
/* Prints message to stdout followed by a newline. */
void print_function(char* message)
{
    char *text = message;

    printf("%s\n", text);
}
/*
 * Reads from the descriptor until end of file and hands the data to
 * print_func in NUL-terminated pieces that contain only complete UTF-8
 * characters.
 *
 * The length of each character is taken from the number of leading 1 bits
 * of its lead byte. A character whose announced length runs past the data
 * read so far is held back and prepended to the next read. Input is
 * assumed to be valid UTF-8; bytes that cannot start a sequence are passed
 * through one at a time.
 *
 * pipe:       descriptor to read from.
 * print_func: callback receiving each NUL-terminated piece.
 */
void read_pipe(int pipe, void (*print_func)(char*))
{
    char message[BUFFER_LENGTH];
    char to_print[1 + BUFFER_LENGTH];
    int pending = 0; // bytes carried over from the previous read (at most 3)

    for (;;)
    {
        int bytes_read = (int)read(pipe, message + pending, BUFFER_LENGTH - pending);
        if (bytes_read <= 0)
        {
            // end of file or read error: print remaining bytes and stop
            message[pending] = '\0';
            print_func(message);
            break;
        }
        int total = pending + bytes_read;

        // copy complete characters to buffer to_print
        int in_pos = 0;
        int out_pos = 0;
        while (in_pos != total)
        {
            unsigned char c = (unsigned char)message[in_pos];
            int length = 1;
            if (0xc0 == (0xc0 & c))
            {
                // count the leading 1 bits of the lead byte
                length = 0;
                while (0 != (0x80 & c))
                {
                    c <<= 1;
                    ++length;
                }
                if (length > 4)
                {
                    // 0xF8..0xFF never start a UTF-8 sequence
                    length = 1;
                }
                else if (in_pos + length > total)
                {
                    break; // rest of this character has not arrived yet
                }
            }
            for (int i = 0; i != length; ++i)
            {
                to_print[out_pos++] = message[in_pos++];
            }
        }

        // finish buffer with complete characters and print it
        to_print[out_pos] = '\0';
        print_func(to_print);

        // move tail to the beginning of the input buffer
        pending = total - in_pos;
        for (int i = 0; i != pending; ++i)
        {
            message[i] = message[in_pos + i];
        }
    }
}
/* Entry point: streams standard input through read_pipe(), printing with print_function. */
int main()
{
    const int input_fd = STDIN_FILENO;

    read_pipe(input_fd, print_function);
    return 0;
}
/*
 * Counts the bytes of the NUL-terminated string s that are not UTF-8
 * continuation bytes (10xxxxxx); for valid UTF-8 this is the number of
 * encoded characters.
 * Approach from: http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
 */
int strlen_utf8(const char* s)
{
    int count = 0;

    for (const char *p = s; *p != '\0'; ++p)
    {
        if ((*p & 0xc0) != 0x80)
        {
            ++count;
        }
    }
    return count;
}
/*
 * Decodes the NUL-terminated UTF-8 string s into ws, one wchar_t per
 * encoded character, and terminates ws with L'\0'.
 *
 *   0xxxxxxx                             1 byte
 *   110xxxxx 10xxxxxx                    2 bytes
 *   1110xxxx 10xxxxxx 10xxxxxx           3 bytes
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4 bytes
 *
 * ws: output buffer; exactly one element is written per non-continuation
 *     byte of s, plus the terminator, so strlen_utf8(s) + 1 elements
 *     suffice.
 * s:  input string.
 *
 * Malformed input never causes a read past the terminator: a sequence that
 * is cut short or starts with an invalid lead byte yields U+FFFD, and
 * stray continuation bytes are skipped. Code points that do not fit in
 * wchar_t (4-byte sequences when wchar_t is 16 bits wide) are truncated by
 * the final conversion.
 */
void utf8_to_wchar_t(wchar_t * ws,const char* s)
{
    const unsigned char *p = (const unsigned char *)s;
    size_t out = 0;

    while (*p != 0)
    {
        unsigned char lead = *p++;
        unsigned long cp;
        int extra; // continuation bytes still expected

        if (lead < 0x80)                    // 0xxxxxxx
        {
            cp = lead;
            extra = 0;
        }
        else if ((lead & 0xC0) == 0x80)     // stray continuation byte
        {
            continue;
        }
        else if ((lead & 0xE0) == 0xC0)     // 110xxxxx
        {
            cp = lead & 0x1Fu;
            extra = 1;
        }
        else if ((lead & 0xF0) == 0xE0)     // 1110xxxx
        {
            cp = lead & 0x0Fu;
            extra = 2;
        }
        else if ((lead & 0xF8) == 0xF0)     // 11110xxx
        {
            cp = lead & 0x07u;
            extra = 3;
        }
        else                                // 0xF8..0xFF: not a valid lead
        {
            cp = 0xFFFDu;
            extra = 0;
        }

        for (; extra > 0; --extra)
        {
            if ((*p & 0xC0) != 0x80)
            {
                // sequence cut short (possibly by the terminator)
                cp = 0xFFFDu;
                break;
            }
            cp = (cp << 6) | (unsigned long)(*p++ & 0x3Fu);
        }

        ws[out++] = (wchar_t)cp;
    }
    ws[out] = L'\0';
}
/*
 * Manual check for utf8_to_wchar_t(): converts a two-character UTF-8
 * sample, then writes the raw UTF-8 bytes to "a.txt" and the converted
 * text to "a2.txt" as UTF-16 little endian with a byte order mark.
 *
 * Each code unit is serialised explicitly as two bytes, so the output does
 * not depend on the size or byte order of wchar_t on the host. Only the
 * low 16 bits of each element are written; the sample contains no
 * characters outside that range. A file that cannot be opened is skipped.
 */
void test()
{
    const char s[] = "\xc5\x9f\xe2\x98\xba";//test utf8
    wchar_t ws[100];
    utf8_to_wchar_t(ws, s);

    //write 8bit
    FILE* fp = fopen("a.txt", "wb");
    if (fp != NULL)
    {
        fwrite(s, 1, sizeof s - 1, fp);
        fclose(fp);
    }

    //write 16bit
    FILE* fp2 = fopen("a2.txt", "wb");
    if (fp2 != NULL)
    {
        fwrite("\xff\xfe", 1, 2, fp2); //little endian
        for (int k = 0; ws[k] != L'\0'; ++k)
        {
            unsigned long unit = (unsigned long)ws[k];
            unsigned char bytes[2];
            bytes[0] = (unsigned char)(unit & 0xFFu);
            bytes[1] = (unsigned char)((unit >> 8) & 0xFFu);
            fwrite(bytes, 1, 2, fp2);
        }
        fclose(fp2);
    }
}