Tag Archives: 中文

新的轉碼方法 , libiconv , BIG5 轉 UTF8

這段 code , 很好用, 就貼在這兒了, 下次可以直接剪下跟貼上

重點有兩個:

1. 來源編碼 (from encoding) 要指定為 CP950

2. 不要用 libc 內建的 iconv, 要改用 GNU libiconv, 因為 libc 的版本缺一個 function : iconvctl

/*
 * Convert a NUL-terminated string from one character encoding to another
 * with iconv.
 *
 * from    - name of the source encoding (e.g. "CP950")
 * to      - name of the target encoding (e.g. "UTF-8")
 * save    - output buffer; receives the converted, NUL-terminated text
 * savelen - capacity of save in bytes
 * src     - input text; src[srclen] must be the terminating NUL, because
 *           the terminator is converted together with the text
 * srclen  - length of src in bytes, not counting the terminating NUL
 *
 * Returns strlen(save) on success, or a negative code:
 *   -1  invalid argument, or the encoding pair is not supported
 *   -5  save is too small for the converted text plus its terminator
 *   -6  any other iconv failure
 *
 * Invalid input sequences are dropped instead of being reported as an error.
 * A truncated multibyte sequence at the very end of the input stops the
 * conversion; whatever was converted up to that point is returned.
 */
int fnConvert(const char *from, const char *to, char* save, int savelen, char *src, int srclen)
{
    iconv_t cd;
    char *inptr = src;
    char *outptr = save;
    size_t insize;
    size_t outsize;
    int status;
#ifdef ICONV_SET_DISCARD_ILSEQ
    int discard_enabled = 0;
#endif

    /* Negative lengths would wrap around to huge size_t values below. */
    if (from == NULL || to == NULL || save == NULL || src == NULL || srclen < 0)
    {
        return -1;
    }
    if (savelen <= 0)
    {
        return -5;
    }

    cd = iconv_open(to, from);
    if (cd == (iconv_t)-1)
    {
        /* Nothing was opened, so there is nothing to close. */
        return -1;
    }

    /* +1: convert the terminating NUL too, so the output ends up terminated. */
    insize = (size_t)srclen + 1;
    outsize = (size_t)savelen;

    while (insize > 0)
    {
        if (iconv(cd, &inptr, &insize, &outptr, &outsize) != (size_t)(-1))
        {
            continue;
        }

        if (errno == EILSEQ)
        {
#ifdef ICONV_SET_DISCARD_ILSEQ
            /* GNU libiconv: ask the library to drop invalid sequences, then retry. */
            if (!discard_enabled)
            {
                int one = 1;
                iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, &one);
                discard_enabled = 1;
                continue;
            }
#endif
            /* No iconvctl (or it did not help): step over the offending byte
             * ourselves so the loop always makes progress. */
            if (insize > 0)
            {
                inptr++;
                insize--;
            }
        }
        else if (errno == EINVAL)
        {
            /* Incomplete multibyte sequence at the end of the input. */
            break;
        }
        else if (errno == E2BIG)
        {
            status = -5;
            goto done;
        }
        else
        {
            status = -6;
            goto done;
        }
    }

    /* The loop can stop before the terminator was converted (EINVAL, or the
     * terminator was skipped as part of an invalid sequence); make sure
     * strlen() below never runs off the end of the buffer. */
    if (outptr == save || outptr[-1] != '\0')
    {
        if (outsize == 0)
        {
            status = -5;
            goto done;
        }
        *outptr = '\0';
    }
    status = (int)strlen(save);

done:
    iconv_close(cd);
    return status;
}

EXAMPLE:
/*
 * Convert a NUL-terminated CP950 (Big5) string to UTF-8 in place.
 *
 * str - text to convert; on success it is overwritten with the UTF-8 result.
 *       The result can be longer than the input, so the caller's buffer
 *       must have spare room for it.
 *
 * Returns the value returned by fnConvert when it is positive (success).
 * Returns -1 for a NULL or empty string, -5 when the input cannot fit the
 * internal 2048-byte work buffer, and otherwise fnConvert's non-positive
 * status, which is also printed to stdout.
 */
int fnB2U( unsigned char *str )
{
    unsigned char tmp[2048];
    size_t srclen;
    size_t destlen;
    int status = -1;

    if ( str == NULL )
        return status;
    srclen = strlen( (const char *)str );
    if ( srclen == 0 )
        return status;

    /* Allow up to two output bytes per input byte plus the terminator,
     * but never claim more room than tmp really has. */
    destlen = srclen * 2 + 1;
    if ( srclen >= sizeof tmp )
        destlen = 0;
    else if ( destlen > sizeof tmp )
        destlen = sizeof tmp;

    tmp[0] = '\0';
    if ( destlen == 0 )
        status = -5;   /* input alone already exceeds the work buffer */
    else
        status = fnConvert( "CP950", "UTF-8", (char *)tmp, (int)destlen, (char *)str, (int)srclen );

    if ( status > 0 )
        strcpy( (char *)str, (const char *)tmp );
    else
        printf( "status = %d\n", status );
    return status;
}

中文 字詞 文章 索引 對照規畫

tbl_term

term_id term
1 ipod
2 apple
3 nano
4 iphone

tbl_doc

doc_id text_content uptime
1 【蘋果先生】Moshi iGlaze Apple iPhone 3G 專用保護殼 黑/紅/白
2 iTunes Gift Card蘋果線上音樂商店預付卡儲值Apple iTunes Shop Ipod Nano Shuffle iphone Mp3電影歌曲下載促銷
3 Apple iPhone 3G (8G)  

tbl_lookup

id term_id doc_id
1 2 1
2 4 1
3 2 2
4 1 2
5 3 2
6 4 2
7 2 3
8 4 3