Add Japanese encoding detection (ASCII / 7-bit JIS / SJIS / EUC) to file-5.05.

Whitespace in the context lines was reconstructed from a flattened copy of
this patch; if a hunk is rejected, retry with "patch -l".

--- file-5.05/src/Makefile.am.vinejtext	2010-07-22 00:56:10.000000000 +0900
+++ file-5.05/src/Makefile.am	2011-02-11 16:53:06.000000000 +0900
@@ -4,11 +4,11 @@
 
 bin_PROGRAMS = file
 
-AM_CPPFLAGS = -DMAGIC='"$(MAGIC)"'
+AM_CPPFLAGS = -DMAGIC='"$(MAGIC)"' -DDETECT_JAPANESE
 AM_CFLAGS = @WARNINGS@
 
 libmagic_la_SOURCES = magic.c apprentice.c softmagic.c ascmagic.c \
-	encoding.c compress.c is_tar.c readelf.c print.c fsmagic.c \
+	encoding.c compress.c is_tar.c readelf.c print.c jcode.c fsmagic.c \
 	funcs.c file.h names.h patchlevel.h readelf.h tar.h apptype.c \
 	file_opts.h elfclass.h mygetopt.h cdf.c cdf_time.c readcdf.c cdf.h
 libmagic_la_LDFLAGS = -no-undefined -version-info 1:0:0
--- file-5.05/src/encoding.c.vinejtext	2010-07-22 01:47:17.000000000 +0900
+++ file-5.05/src/encoding.c	2011-02-11 17:26:00.000000000 +0900
@@ -42,7 +42,7 @@ FILE_RCSID("@(#)$File: encoding.c,v 1.5
 #include <string.h>
 #include <memory.h>
 #include <stdlib.h>
-
+#include "jcode.h"
 
 private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
@@ -68,7 +68,7 @@ protected int
 file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type)
 {
 	size_t mlen;
-	int rv = 1, ucs_type;
+	int rv = 1, ucs_type, jcode;
 	unsigned char *nbuf = NULL;
 
 	mlen = (nbytes + 1) * sizeof(nbuf[0]);
@@ -83,10 +83,27 @@ file_encoding(struct magic_set *ms, cons
 	}
 
 	*type = "text";
-	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
+	jcode = detect_kcode(buf, nbytes, *ubuf, ulen);
+	if (jcode == ASCII) {
 		DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
 		*code = "ASCII";
 		*code_mime = "us-ascii";
+	} else if (jcode == JIS) {
+		DPRINTF(("jis %" SIZE_T_FORMAT "u\n", *ulen));
+		*code = "7-bit JIS [ESC$B, ESC(B]";
+		*code_mime = "jis";
+	} else if (jcode == SJIS){
+		DPRINTF(("sjis %" SIZE_T_FORMAT "u\n", *ulen));
+		*code = "SJIS";
+		*code_mime = "sjis";
+	} else if (jcode == EUC){
+		DPRINTF(("euc %" SIZE_T_FORMAT "u\n", *ulen));
+		*code = "EUC";
+		*code_mime = "euc-jp";
+	} else if (jcode == EUCORSJIS){
+		DPRINTF(("euc or sjis %" SIZE_T_FORMAT "u\n", *ulen));
+		*code = "EUC or SJIS";
+		*code_mime = "unknown";
 	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
 		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
 		*code = "UTF-8 Unicode (with BOM)";
--- /dev/null	2011-02-06 21:11:58.373999997 +0900
+++ file-5.05/src/jcode.c	2011-02-11 17:14:29.000000000 +0900
@@ -0,0 +1,184 @@
+/*
+ * jcode.c: Kanji-code detection routines by Jun Nishii <jun@vinelinux.org>
+ *          modified by Ryoichi INAGAKI <inagaki@vinelinux.org>
+ *
+ * Classifies a byte buffer as ASCII, 7-bit JIS, SJIS or EUC text using
+ * two class tables: jp_chars1 for single and lead bytes, jp_chars2 for
+ * the second byte of a two-byte character.
+ */
+#include "file.h"	/* for unichar, which jcode.h uses */
+#include "jcode.h"
+
+#define F 0   /* character never appears in text */
+#define T 1   /* character appears in plain ASCII text */
+#define I 2   /* character appears in ISO-8859 text */
+#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
+#define J 4   /* character appears in JIS or plain ASCII */
+#define S 5   /* character appears in SJIS */
+#define E 6   /* character appears in EUC */
+#define O 7   /* character appears in EUC or SJIS */
+
+#define ESC 27
+
+/* Class of a single byte or of the first byte of a two-byte character. */
+static const unsigned char jp_chars1[256] = {
+	/*                  BEL BS HT LF    FF CR    */
+	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
+	/*                              ESC          */
+	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
+	T, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J,  /* 0x2X */
+	J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J,  /* 0x3X */
+	J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J,  /* 0x4X */
+	J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J,  /* 0x5X */
+	J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, J,  /* 0x6X */
+	J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, F,  /* 0x7X */
+	/*            NEL                            */
+	X, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,  /* 0x8X */
+	S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,  /* 0x9X */
+	I, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E,  /* 0xaX */
+	E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E,  /* 0xbX */
+	E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E,  /* 0xcX */
+	E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E,  /* 0xdX */
+	O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,  /* 0xeX */
+	E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, I   /* 0xfX */
+};
+
+/* Class of the second byte of a two-byte character. */
+static const unsigned char jp_chars2[256] = {
+	/*                  BEL BS HT LF    FF CR    */
+	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
+	/*                              ESC          */
+	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
+	S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,  /* 0x4X */
+	S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,  /* 0x5X */
+	S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,  /* 0x6X */
+	S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, F,  /* 0x7X */
+	/*            NEL                            */
+	S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,  /* 0x8X */
+	S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,  /* 0x9X */
+	S, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,  /* 0xaX */
+	O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,  /* 0xbX */
+	O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,  /* 0xcX */
+	O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,  /* 0xdX */
+	O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,  /* 0xeX */
+	O, O, O, O, O, O, O, O, O, O, O, O, O, E, E, I   /* 0xfX */
+};
+
+/*
+ * Test for plain ASCII or 7-bit JIS: every byte must be of class T or J.
+ * Each accepted byte is copied to ubuf and counted in *ulen.
+ * Returns JIS if an ESC $ B or ESC $ @ sequence was seen, ASCII if not,
+ * and 0 as soon as a byte of another class is found.
+ */
+static int
+check_asc_jis(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+    size_t *ulen)
+{
+	size_t i;
+	int jflag = 0;
+
+	*ulen = 0;
+
+	for (i = 0; i < nbytes; i++) {
+		int t = jp_chars1[buf[i]];
+
+		if (t != T && t != J)
+			return 0;
+
+		if (buf[i] == ESC && i + 2 < nbytes && buf[i + 1] == '$' &&
+		    (buf[i + 2] == 'B' || buf[i + 2] == '@'))
+			jflag = 1;
+
+		ubuf[(*ulen)++] = buf[i];
+	}
+
+	return jflag ? JIS : ASCII;
+}
+
+/*
+ * Scanner shared by the two-byte encodings.  "own" is the table class
+ * specific to the encoding being tested (S or E) and "code" the value
+ * reported for it (SJIS or EUC).  Class O bytes are valid in both, so an
+ * O lead byte followed by an O second byte only yields EUCORSJIS, and
+ * only while nothing more specific has been seen.
+ * Each accepted byte is copied to ubuf and counted in *ulen.
+ * Returns code or EUCORSJIS, or 0 if a byte does not fit or the buffer
+ * holds no complete two-byte character.
+ */
+static int
+check_two_byte(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+    size_t *ulen, int own, int code)
+{
+	size_t i;
+	int jflag = 0;
+
+	*ulen = 0;
+
+	for (i = 0; i < nbytes; i++) {
+		int t = jp_chars1[buf[i]];
+
+		if (t != T && t != J && t != own && t != O)
+			return 0;
+
+		/* a lead byte in the last position has no second byte */
+		if ((t == own || t == O) && i + 1 < nbytes) {
+			int t2 = jp_chars2[buf[i + 1]];
+
+			if (t2 == own || (t == own && t2 == O)) {
+				jflag = code;
+			} else if (t2 == O) {
+				if (jflag == 0)
+					jflag = EUCORSJIS;
+			} else {
+				return 0;
+			}
+			ubuf[(*ulen)++] = buf[i];
+			i++;
+		}
+
+		ubuf[(*ulen)++] = buf[i];
+	}
+
+	return jflag;
+}
+
+/* Test for SJIS text; returns SJIS, EUCORSJIS or 0. */
+static int
+check_sjis(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+    size_t *ulen)
+{
+	return check_two_byte(buf, nbytes, ubuf, ulen, S, SJIS);
+}
+
+/* Test for EUC text; returns EUC, EUCORSJIS or 0. */
+static int
+check_euc(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+    size_t *ulen)
+{
+	return check_two_byte(buf, nbytes, ubuf, ulen, E, EUC);
+}
+
+/*
+ * Guess the encoding of buf by trying ASCII/JIS, then SJIS, then EUC;
+ * the first test that matches decides.  ubuf needs room for nbytes
+ * entries; after a match it holds the bytes of buf and *ulen their count.
+ * Returns ASCII, JIS, SJIS, EUC or EUCORSJIS, or 0 if nothing matched.
+ */
+int
+detect_kcode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+    size_t *ulen)
+{
+	int ret;
+
+	ret = check_asc_jis(buf, nbytes, ubuf, ulen);
+	if (ret != 0)
+		return ret;
+
+	ret = check_sjis(buf, nbytes, ubuf, ulen);
+	if (ret != 0)
+		return ret;
+
+	return check_euc(buf, nbytes, ubuf, ulen);
+}
--- /dev/null	2011-02-06 21:11:58.373999997 +0900
+++ file-5.05/src/jcode.h	2011-02-11 17:12:11.000000000 +0900
@@ -0,0 +1,22 @@
+/*
+ * jcode.h - for jcode.c by Jun Nishii <jun@vinelinux.org>
+ *           modified by Ryoichi INAGAKI <inagaki@vinelinux.org>
+ *
+ * Include after file.h: the prototype below uses unichar.
+ */
+#ifndef JCODE_H
+#define JCODE_H
+
+#include <stddef.h>
+
+/* Results of detect_kcode(); 0 means that no encoding matched. */
+#define ASCII		1
+#define JIS		2
+#define EUC		3
+#define SJIS		4
+#define EUCORSJIS	5
+
+extern int detect_kcode(const unsigned char *buf, size_t nbytes,
+    unichar *ubuf, size_t *ulen);
+
+#endif /* JCODE_H */
--- file-5.05/src/names.h.vinejtext	2010-10-09 06:58:44.000000000 +0900
+++ file-5.05/src/names.h	2011-02-11 17:28:18.000000000 +0900
@@ -135,8 +135,6 @@
 	{"/*",		L_C, 2 },	/* must precede "The", "the", etc. */
 	{"#include",	L_C, 2 },
 	{"char",	L_C, 2 },
-	{"The",		L_ENG, 2 },
-	{"the",		L_ENG, 2 },
 	{"double",	L_C, 1 },
 	{"extern",	L_C, 2 },
 	{"float",	L_C, 1 },