文章详情

  • 游戏榜单
  • 软件榜单
关闭导航
热搜榜
热门下载
热门标签
php爱好者> php文档>搜狗scel词库解析

搜狗scel词库解析

时间:2010-09-06  来源:bigluo

这段代码是从 ubuntu.org.cn 上下载的,刚刚测试了一下,能够转换最新的搜狗(sogou)词库(http://pinyin.sogou.com/dict/)。源代码如下:


#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <memory.h>

/*
 * One entry of the pinyin table, kept in a singly linked list.
 *
 *   mark - 16-bit value read from the file next to the syllable; the word
 *          table refers to syllables by this value
 *   py   - the syllable as a NUL-terminated string (6 bytes plus terminator)
 *   next - following entry, or NULL on the last one
 */
struct PY_
{
        unsigned short mark;
        char py[7];
        struct PY_ *next;
};

typedef struct PY_ PY;

/*
 * Encode a single 16-bit code unit as UTF-8.
 *
 * in  - the code unit to encode
 * out - receives 1, 2 or 3 bytes depending on the value; no terminating
 *       NUL is written, so callers that want a string must pre-zero the
 *       buffer
 *
 * Surrogate values get no special treatment: they are encoded like any
 * other value in the three-byte range. Always returns 0.
 */
int unicode2utf8char(unsigned short in, unsigned char * out)
{
        const unsigned int unit = in;

        /* 0x0000..0x007f: one byte, identical to ASCII. */
        if (unit <= 0x007f)
        {
                out[0] = (unsigned char)unit;
                return 0;
        }

        /* 0x0080..0x07ff: 110xxxxx 10xxxxxx */
        if (unit <= 0x07ff)
        {
                out[0] = (unsigned char)(0xc0 | (unit >> 6));
                out[1] = (unsigned char)(0x80 | (unit & 0x3f));
                return 0;
        }

        /* 0x0800..0xffff: 1110xxxx 10xxxxxx 10xxxxxx */
        if (unit <= 0xffff)
        {
                out[0] = (unsigned char)(0xe0 | (unit >> 12));
                out[1] = (unsigned char)(0x80 | ((unit >> 6) & 0x3f));
                out[2] = (unsigned char)(0x80 | (unit & 0x3f));
                return 0;
        }

        /* Only reachable where unsigned short is wider than 16 bits. */
        printf("输入的不是short吧,解析有问题\n");
        return 0;
}

/*
 * Convert a buffer of 16-bit code units into a NUL-terminated UTF-8 string.
 *
 * in     - raw bytes, consumed one unsigned short at a time in host byte
 *          order
 * insize - number of bytes available at `in`; a trailing odd byte is
 *          ignored, and anything below 2 yields an empty string
 * out    - destination; must have room for 3 * (insize / 2) + 1 bytes
 *
 * A code unit of 0 contributes nothing to the output and does NOT end the
 * conversion, so callers that pass a fixed-size field should zero the part
 * of the buffer they did not fill.
 *
 * Always returns 0.
 */
int unicode2utf8str(char * in, int insize,unsigned char * out)
{
        unsigned char *dst = out;
        int units;
        int i;

        if (out == NULL)
        {
                return 0;
        }
        *dst = '\0';
        if (in == NULL || insize < 2)
        {
                return 0;
        }

        units = insize / 2;
        for (i = 0; i < units; i++)
        {
                unsigned char enc[4] = {0};
                unsigned short unit = 0;
                size_t len;

                /* memcpy avoids both alignment and aliasing problems. */
                memcpy(&unit, in + (size_t)i * sizeof unit, sizeof unit);
                unicode2utf8char(unit, enc);

                /* enc was zeroed, so strlen is the encoded length (0 for unit 0). */
                len = strlen((const char *)enc);
                memcpy(dst, enc, len);
                dst += len;
        }
        *dst = '\0';
        return 0;
}


PY * loadPY(FILE * fp)
{
        unsigned char str[128]={0};
        unsigned char outstr[128]={0};
        unsigned short num[16]={0};
        int i;
        PY * head=NULL;
        PY * p=NULL;

        fseek(fp, 0x1540, SEEK_SET);
        fgets(str,4+1,fp);

        if(memcmp(str,"\x9D\x01\x00\x00",4) != 0)
        {
                printf("莫非解析位置有误?\n");
                //return -1;

        }

        head=(PY *)malloc(sizeof(PY));
        head->next=NULL;
        p=head;
        while(1)
        {
                memset(str,0,sizeof(str));
                memset(num,0,sizeof(num));
                for(i=0;i<4;i++)
                {
                        str[i]=fgetc(fp);
                }
                memcpy(num,str,4);

                p->next=(PY *)malloc(sizeof(PY));
                p=p->next;
                p->mark=num[0];

                memset(str,0,sizeof(str));
                fgets(str,num[1]+1,fp);
                unicode2utf8str(str,64,p->py);

                p->next=NULL;
                if( strcmp(p->py,"zuo" ) == 0)
                {
                        return head;
                        break;
                }
        }
}

int creatWordStock(FILE *fp,PY * head)
{
        unsigned char str[256]={0};
        unsigned char outstr[256]={0};
        unsigned char pybuf[128]={0};
        unsigned char hzbuf[128]={0};
        unsigned char buf[256]={0};
        PY *p =NULL;
        FILE * newfp;
        unsigned short num[64]={0};
        int i,count,offset;

        newfp=fopen("sg_pyPhrase.org","w+");
        if( newfp == NULL)
        {
                perror("fopen error");
                return -1;
        }

        fseek(fp, 0x2628, SEEK_SET);
        while(1)
        {
                count=0;
                offset=0;
                p=head->next;
                memset(num,0,sizeof(num));
                memset(str,0,sizeof(str));
                memset(pybuf,0,sizeof(pybuf));
                memset(hzbuf,0,sizeof(hzbuf));
                memset(buf,0,sizeof(buf));

                for(i=0;i<4;i++)
                {
                        str[i]=fgetc(fp);
                        if( feof(fp) )
                        {
                                fclose(newfp);
                                return 0;
                        }
                }

                memcpy(num,str,4);
                offset=num[0]-1;
                count=num[1];
                memset(str,0,sizeof(str));
                for(i=0;i<count;i++)
                {
                        str[i]=fgetc(fp);
                        if( feof(fp) )
                        {
                                fclose(newfp);
                                return 0;
                        }
                }
                memset(num,0,sizeof(num));
                memcpy(num,str,count);

                for(i=0;i<count/2;i++)
                {
                        p=head->next;
                        while(p!=NULL)
                        {
                                if( p->mark == num[i])
                                {
                                        strcat(pybuf,p->py);
                                        strcat(pybuf,"'");
                                        p=NULL;
                                        break;
                                }
                                p=p->next;
                        }
                }
                if( pybuf[strlen(pybuf)-1] == '\'' )
                        pybuf[strlen(pybuf)-1] = '\0';

                memset(num,0,sizeof(num));
                memcpy(num,str,count);
                for(i=0;i<2;i++)
                {
                        str[i]=fgetc(fp);
                        if( feof(fp) )
                        {
                                fclose(newfp);
                                return 0;
                        }
                }
                memcpy(num,str,count);
                count=num[0];

                memset(num,0,sizeof(num));
                memcpy(num,str,count);
                for(i=0;i<count;i++)
                {
                        str[i]=fgetc(fp);
                        if( feof(fp) )
                        {
                                fclose(newfp);
                                return 0;
                        }
                }
                unicode2utf8str(str,64,hzbuf);
                sprintf(buf,"%s %s",pybuf,hzbuf);
                fprintf(newfp,"%s\n",buf);
                for(i=0;i<(12+offset*(12+count+2));i++)
                {
                        str[i]=fgetc(fp);
                        if( feof(fp) )
                        {
                                fclose(newfp);
                                return 0;
                        }
                }
        }
        return 0;
}

/*
 * Release every node of a pinyin list, including the dummy first node.
 *
 * head - list returned by loadPY(); NULL is accepted and does nothing.
 *        The pointer must not be used after this call.
 */
void freePY(PY * head)
{
        while (head != NULL)
        {
                /* Fetch the successor before the node is released. */
                PY *next = head->next;

                free(head);
                head = next;
        }
}


/*
 * Print one header field: read 64 bytes of 16-bit text at `offset`,
 * convert them to UTF-8 and print them after `label`, followed by a newline.
 * A failed seek or short read prints whatever could be read (possibly nothing).
 */
static void main_print_field(FILE *fp, long offset, const char *label)
{
        unsigned char raw[128] = {0};
        unsigned char text[128] = {0};

        if (fseek(fp, offset, SEEK_SET) == 0)
        {
                /* fread, not fgets: the data is binary and may contain 0x0A bytes. */
                size_t got = fread(raw, 1, 64, fp);

                (void)got;      /* raw[] is zeroed, so a short read is harmless */
        }
        unicode2utf8str((char *)raw, 64, text);
        printf("%s%s\n", label, (const char *)text);
}

/*
 * Entry point: argv[1] names the dictionary file. Checks the 8-byte file
 * signature, prints four header fields (offsets 0x130, 0x338, 0x540, 0xd40),
 * then loads the pinyin table and writes the converted word list.
 *
 * Returns 0 on success, and also when no file was given or the signature
 * does not match (as before); -1 if the file cannot be opened, the pinyin
 * table cannot be loaded, or the conversion reports an error.
 */
int main(int argc ,char * argv[])
{
        static const unsigned char magic[8] =
                { 0x40, 0x15, 0x00, 0x00, 0x44, 0x43, 0x53, 0x01 };
        unsigned char signature[8];
        FILE *fp;
        PY *head;
        int rc = 0;

        if (argc <= 1)
        {
                printf("请输入sg词库文件!");
                return 0;
        }

        /* Binary mode: text mode would translate bytes on some platforms. */
        fp = fopen(argv[1], "rb");
        if (fp == NULL)
        {
                perror("fopen error");
                return -1;
        }

        if (fread(signature, 1, sizeof signature, fp) != sizeof signature
            || memcmp(signature, magic, sizeof magic) != 0)
        {
                printf("你确认你选择的是搜狗(.scel)词库?\n");
                fclose(fp);
                return 0;
        }

        main_print_field(fp, 0x130, "字库名称:");
        main_print_field(fp, 0x338, "字库类别:");
        main_print_field(fp, 0x540, "字库信息:");
        main_print_field(fp, 0xd40, "字库示例:");

        head = loadPY(fp);
        if (head == NULL)
        {
                rc = -1;
        }
        else
        {
                if (creatWordStock(fp, head) != 0)
                {
                        rc = -1;
                }
                freePY(head);
        }

        fclose(fp);
        return rc;
}


相关阅读 更多 +
排行榜 更多 +
辰域智控app

辰域智控app

系统工具 下载
网医联盟app

网医联盟app

运动健身 下载
汇丰汇选App

汇丰汇选App

金融理财 下载