#!/usr/local/bin/ruby
#
# Masatoshi SEKI
# $Id: kreg.rb,v 1.4 1999/03/24 13:25:25 mas Exp $
# Copyright (C) 1998-1999 Masatoshi SEKI
#
# Ported from C code by H.Okumura that was found on Netnews.
# Feel free to modify and use it.
#

# Normalization of EUC-JP encoded Japanese text.
#
# Kreg.regularize(src) returns a copy of +src+ in which
# * half-width kana are replaced by full-width katakana,
# * full-width digits, Latin letters, the symbols listed in KIGOU and
#   the Latin look-alike Greek capitals are replaced by ASCII,
# * a kana followed by a separate (han)dakuten mark is merged into the
#   voiced / semi-voiced kana (e.g. KA + dakuten -> GA),
# * the full-width space becomes an ASCII space, and
# * the long-vowel mark directly after a single-byte character becomes '-'.
#
# All processing is done on raw bytes (String#unpack / Array#pack), so the
# code does not depend on $KCODE or on the String encoding machinery.
module Kreg
  # Full-width replacements for the half-width kana 0x8EA1..0x8EDF,
  # indexed by (second byte - 0xA1).
  KANA = [
    0xA1A3,0xA1D6,0xA1D7,0xA1A2,0xA1A6,0xA5F2,0xA5A1,0xA5A3,
    0xA5A5,0xA5A7,0xA5A9,0xA5E3,0xA5E5,0xA5E7,0xA5C3,0xA1BC,
    0xA5A2,0xA5A4,0xA5A6,0xA5A8,0xA5AA,0xA5AB,0xA5AD,0xA5AF,
    0xA5B1,0xA5B3,0xA5B5,0xA5B7,0xA5B9,0xA5BB,0xA5BD,0xA5BF,
    0xA5C1,0xA5C4,0xA5C6,0xA5C8,0xA5CA,0xA5CB,0xA5CC,0xA5CD,
    0xA5CE,0xA5CF,0xA5D2,0xA5D5,0xA5D8,0xA5DB,0xA5DE,0xA5DF,
    0xA5E0,0xA5E1,0xA5E2,0xA5E4,0xA5E6,0xA5E8,0xA5E9,0xA5EA,
    0xA5EB,0xA5EC,0xA5ED,0xA5EF,0xA5F3,0xA1AB,0xA1AC
  ].freeze

  # Flags for the codes 0xA4AB..0xA5DB, indexed by (code - 0xA4AB):
  # bit 1 = the kana can take a dakuten, bit 2 = it can take a handakuten.
  DAKUTEN = [
    1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,0,
    0,0,0,0,0,3,0,0,3,0,0,3,0,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,
    0,0,1,0,1,0,1,0,0,0,0,0,0,3,0,0,3,0,0,3,0,0,3,0,0,3
  ].freeze

  # Symbols and Greek letters that are replaced by an ASCII character.
  KIGOU = {
    0xA1AA => '!',
    0xA1C9 => '"',
    0xA1F4 => '#',
    0xA1F0 => '$',
    0xA1F3 => '%',
    0xA1F5 => '&',
    0xA1C7 => '\'',
    0xA1CA => '(',
    0xA1CB => ')',
    0xA1F6 => '*',
    0xA1DC => '+',
    0xA1A4 => ',',
    0xA1DD => '-',
    0xA1A5 => '.',
    0xA1BF => '/',
    0xA1A7 => ':',
    0xA1A8 => ';',   # was ',' -- the C reference maps 0xA1A8 to ';'
    0xA1E3 => '<',
    0xA1E1 => '=',
    0xA1E4 => '>',
    0xA1A9 => '?',
    0xA1F7 => '@',
    0xA1CE => '[',
    0xA1EF => '\\',
    0xA1CF => ']',
    0xA1B0 => '^',
    0xA1B2 => '_',
    0xA3E0 => '`',
    0xA1D0 => '{',
    0xA1C3 => '|',
    0xA1D1 => '}',
    0xA1B1 => '~',
    0xA6A1 => 'A',
    0xA6A2 => 'B',
    0xA6A5 => 'E',
    0xA6A6 => 'Z',
    0xA6A7 => 'H',
    0xA6A9 => 'I',
    0xA6AA => 'K',
    0xA6AC => 'M',
    0xA6AD => 'N',
    0xA6AF => 'O',
    0xA6B1 => 'P',
    0xA6B3 => 'T',
    0xA6B6 => 'X'
  }.freeze

  # Normalizes +src+ and returns the result as a new String.
  #
  # src:: String whose bytes are EUC-JP text.  It is not modified.
  #
  # Returns a String holding the normalized EUC-JP bytes.  Where String
  # encodings exist, the result is tagged with the encoding of +src+.
  #
  # Bytes that do not form a valid EUC-JP sequence, and three-byte
  # (0x8F-prefixed) characters, are copied through unchanged.  Newlines are
  # kept.  A NUL byte is dropped, because code 0 means "nothing pending"
  # (the C reference behaves the same way).
  def regularize(src)
    bytes = src.unpack('C*')
    dest = String.new      # result
    d = 0                  # previously read character, not yet written out
    pos = 0
    while pos < bytes.size
      c, used = read_char(bytes, pos)
      pos += used

      case c >> 8
      when 0x8E             # half-width kana -> full-width kana
        c = KANA[c - 0x8EA1]
      when 0xA3             # full-width digits and Latin letters
        if c >= 0xA3B0 && c <= 0xA3B9
          c -= 0xA3B0 - 0x30        # '0'..'9'
        elsif c >= 0xA3C1 && c <= 0xA3DA
          c -= 0xA3C1 - 0x41        # 'A'..'Z'
        elsif c >= 0xA3E1 && c <= 0xA3FA
          c -= 0xA3E1 - 0x61        # 'a'..'z'
        end
      end

      # Symbols and Greek letters.  Done for every character, as in the C
      # reference, so that keys in row 0xA3 (0xA3E0) are reachable too.
      kigou = KIGOU[c]
      c = s2i(kigou) if kigou

      # Rules that depend on the previous character.
      case c
      when 0xA1A1           # full-width space
        c = 0x20
      when 0xA1BC           # long-vowel mark
        c = 0x2D if d < 0x0100      # '-' right after a single-byte character
      when 0xA1AB           # dakuten
        if d >= 0xA4AB && d <= 0xA5DB
          # Integer 0 is truthy in Ruby, so the flag needs an explicit test.
          if (DAKUTEN[d - 0xA4AB] & 1) != 0
            c = d + 1               # the voiced kana is the next code
            d = 0
          elsif d == 0xA5A6         # katakana U
            c = 0xA5F4              # katakana VU
            d = 0
          end
        end
      when 0xA1AC           # handakuten
        if d >= 0xA4CF && d <= 0xA5DB
          if (DAKUTEN[d - 0xA4AB] & 2) != 0
            c = d + 2               # the semi-voiced kana is two codes on
            d = 0
          end
        end
      end

      dest << i2s(d)        # write the previous character
      d = c
    end
    dest << i2s(d)          # write the character that is still pending
    dest.force_encoding(src.encoding) if dest.respond_to?(:force_encoding)
    dest
  end
  module_function :regularize

  # Reads one EUC-JP character from the byte array +bytes+ at index +pos+.
  #
  # Returns [code, length]: +code+ is the bytes of the character combined
  # big-endian into one Integer, +length+ the number of bytes consumed.
  # A byte that does not start a valid sequence is returned on its own.
  def read_char(bytes, pos)
    b0 = bytes[pos]
    b1 = bytes[pos + 1]
    b2 = bytes[pos + 2]
    if b0 == 0x8E           # half-width kana
      return [0x8E00 + b1, 2] if b1 && b1 >= 0xA1 && b1 <= 0xDF
    elsif b0 == 0x8F        # three-byte character
      if euc_trail?(b1) && euc_trail?(b2)
        return [(b0 << 16) + (b1 << 8) + b2, 3]
      end
    elsif euc_trail?(b0)    # two-byte character
      return [(b0 << 8) + b1, 2] if euc_trail?(b1)
    end
    [b0, 1]
  end
  module_function :read_char
  private :read_char

  # True if +byte+ is in 0xA1..0xFE, the range used by the bytes of a
  # multi-byte EUC-JP character.  +byte+ may be nil (past the end of input).
  def euc_trail?(byte)
    !byte.nil? && byte >= 0xA1 && byte <= 0xFE
  end
  module_function :euc_trail?
  private :euc_trail?

  # Converts a one- or two-byte String into its Integer character code
  # (first byte * 0x100 + second byte).  Any other length yields 0.
  def s2i(str)
    b = str.unpack('C*')
    case b.size
    when 1
      b[0]
    when 2
      b[0] * 0x100 + b[1]   # was str[2], which does not exist
    else
      0
    end
  end
  module_function :s2i
  private :s2i

  # Converts an Integer character code into the String holding its bytes.
  # 0 yields the empty String; codes of 0x100 and above yield two bytes,
  # codes of 0x10000 and above three bytes.
  def i2s(num)
    if num == 0
      b = []
    elsif num >= 0x10000
      b = [(num >> 16) & 0xFF, (num >> 8) & 0xFF, num & 0xFF]
    elsif num >= 0x100
      b = [num >> 8, num & 0xFF]
    else
      b = [num]
    end
    b.pack('C*')
  end
  module_function :i2s
  private :i2s
end

# Run as a script: echo every input line, followed by its normalized form.
if __FILE__ == $0
  while src = gets
    src.chomp!
    print src, "\n"
    print Kreg::regularize(src), "\n"
  end
end

=begin
/***********************************************************
    reg.c -- document normalization
    H.Okumura
************************************************************
    Normalizes EUC documents.
    - Converts half-width kana to full-width kana.
    - Converts the full-width alphabet to half-width.
    - Fixes, for example, "か゛" into "が".
    - Converts "ー" directly after a half-width character into "-".
    Usage: reg <infile >outfile
       or  reg infile1 infile2 ... >outfile
    Feel free to modify and use it.
***********************************************************/

#include <stdio.h>

/* Table converting the half-width characters 0xA1-0xDF to full-width */
/* 。「」、・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソ */
/* タチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゛゜ */
unsigned kana[] = {
    0xA1A3,0xA1D6,0xA1D7,0xA1A2,0xA1A6,0xA5F2,0xA5A1,0xA5A3,
    0xA5A5,0xA5A7,0xA5A9,0xA5E3,0xA5E5,0xA5E7,0xA5C3,0xA1BC,
    0xA5A2,0xA5A4,0xA5A6,0xA5A8,0xA5AA,0xA5AB,0xA5AD,0xA5AF,
    0xA5B1,0xA5B3,0xA5B5,0xA5B7,0xA5B9,0xA5BB,0xA5BD,0xA5BF,
    0xA5C1,0xA5C4,0xA5C6,0xA5C8,0xA5CA,0xA5CB,0xA5CC,0xA5CD,
    0xA5CE,0xA5CF,0xA5D2,0xA5D5,0xA5D8,0xA5DB,0xA5DE,0xA5DF,
    0xA5E0,0xA5E1,0xA5E2,0xA5E4,0xA5E6,0xA5E8,0xA5E9,0xA5EA,
    0xA5EB,0xA5EC,0xA5ED,0xA5EF,0xA5F3,0xA1AB,0xA1AC};

/* In 0xA4AB-0xA5DB: takes a dakuten (1) + takes a handakuten (2) */
unsigned char dakuten[] = {
    1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,0,
    0,0,0,0,0,3,0,0,3,0,0,3,0,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,
    0,0,1,0,1,0,1,0,0,0,0,0,0,3,0,0,3,0,0,3,0,0,3,0,0,3};

unsigned lastc = '\n';  /* last character that was output */

void put(unsigned c)  /* output one character (dropping spaces at end of line) */
{
    static unsigned spaces = 0;

    if (c == ' ') spaces++;
    else {
        if (spaces != 0) {
            if (c != '\n')
                while (spaces--) putchar(' ');  /* output the spaces together */
            spaces = 0;
        }
        if (c >> 8) putchar(c >> 8);
        if (c & 0xFF) putchar(c & 0xFF);
        lastc = c;
    }
}

void regularize(FILE *fp)  /* normalize the file fp */
{
    unsigned c, d, t, ch;

    d = 0;  /* the character read just before (1-2 bytes) */
    c = getc(fp);
    while ((int)c != EOF) {
        if (c >= 0xA1) {  /* kanji */
            t = getc(fp);
            if (c == 0xFF || t < 0xA1 || t == 0xFF)  /* invalid */
                c = 0xA1A1;
            else c = (c << 8) + t;
        } else if (c == 0x8E) {  /* half-width kana */
            t = getc(fp);
            if (t < 0xA1 || t > 0xDF)  /* invalid */
                c = 0x20;
            else c = 0x8E00 + t;
        }
        ch = c >> 8;
        if (ch == 0) {  /* 8-bit character */
            /* do nothing */
        } else if (ch == 0x8E) {  /* half-width kana */
            c = kana[c - 0x8EA1];  /* to full-width kana */
        } else if (ch == 0xA3) {  /* full-width alphanumerics */
            if (c >= 0xA3B0 && c <= 0xA3B9) {  /* full-width 0-9 */
                c -= 0xA3B0 - '0';  /* 0-9 */
            } else if (c >= 0xA3C1 && c <= 0xA3DA) {  /* full-width A-Z */
                c -= 0xA3C1 - 'A';  /* A-Z */
            } else if (c >= 0xA3E1 && c <= 0xA3FA) {  /* full-width a-z */
                c -= 0xA3E1 - 'a';  /* a-z */
            }
        } else if (ch == 0xA6) {
            if (c >= 0xA6A1 && c <= 0xA6B6) {  /* Greek letters */
                if (c == 0xA6A1) c = 'A';
                else if (c == 0xA6A2) c = 'B';
                else if (c == 0xA6A5) c = 'E';
                else if (c == 0xA6A6) c = 'Z';
                else if (c == 0xA6A7) c = 'H';
                else if (c == 0xA6A9) c = 'I';
                else if (c == 0xA6AA) c = 'K';
                else if (c == 0xA6AC) c = 'M';
                else if (c == 0xA6AD) c = 'N';
                else if (c == 0xA6AF) c = 'O';
                else if (c == 0xA6B1) c = 'P';
                else if (c == 0xA6B3) c = 'T';
                else if (c == 0xA6B6) c = 'X';
            }
        }
        /* added by seki */
        if (c ==0xA1AA) c = '!';
        else if (c ==0xA1C9) c = '"';
        else if (c ==0xA1F4) c = '#';
        else if (c ==0xA1F0) c = '$';
        else if (c ==0xA1F3) c = '%';
        else if (c ==0xA1F5) c = '&';
        else if (c ==0xA1C7) c = '\'';
        else if (c ==0xA1CA) c = '(';
        else if (c ==0xA1CB) c = ')';
        else if (c ==0xA1F6) c = '*';
        else if (c ==0xA1DC) c = '+';
        else if (c ==0xA1A4) c = ',';
        else if (c ==0xA1DD) c = '-';
        else if (c ==0xA1A5) c = '.';
        else if (c ==0xA1BF) c = '/';
        else if (c ==0xA1A7) c = ':';
        else if (c ==0xA1A8) c = ';';
        else if (c ==0xA1E3) c = '<';
        else if (c ==0xA1E1) c = '=';
        else if (c ==0xA1E4) c = '>';
        else if (c ==0xA1A9) c = '?';
        else if (c ==0xA1F7) c = '@';
        else if (c ==0xA1CE) c = '[';
        else if (c ==0xA1EF) c = '\\';
        else if (c ==0xA1CF) c = ']';
        else if (c ==0xA1B0) c = '^';
        else if (c ==0xA1B2) c = '_';
        else if (c ==0xA3E0) c = '`';
        else if (c ==0xA1D0) c = '{';
        else if (c ==0xA1C3) c = '|';
        else if (c ==0xA1D1) c = '}';
        else if (c ==0xA1B1) c = '~';

        /* processing that depends on the previous character */
        if (c == 0xA1A1) {  /* a full-width space */
#if 0
            if (d < 0x8F00)
#endif
                c = ' ';  /* becomes a half-width space */
        } else if (c == 0xA1BC) {  /* ー */
            if (d < 0x0100) c = '-';  /* right after half-width: to - */
        } else if (c == 0xA1AB) {  /* ゛ */
            if (d >= 0xA4AB && d <= 0xA5DB) {
                if (dakuten[d - 0xA4AB] & 1) {
                    c = d + 1;  d = 0;  /* adding 1 gives the voiced kana */
                } else if (d == 0xA5A6) {  /* ウ */
                    c = 0xA5F4;  d = 0;  /* ヴ */
                }
            }
        } else if (c == 0xA1AC) {  /* ゜ */
            if (d >= 0xA4CF && d <= 0xA5DB) {
                if (dakuten[d - 0xA4AB] & 2) {
                    c = d + 2;  d = 0;  /* adding 2 gives the semi-voiced kana */
                }
            }
        }
        /* output the previous character */
        put(d);
        d = c;
        c = getc(fp);
    }
    /* output the remaining character (adding the final \n if missing) */
    put(d);
    if (lastc != '\n') put('\n');
}

int main(int argc, char *argv[])
{
    int i;
    FILE *fp;

    if (argc == 1) regularize(stdin);
    else {
        for (i = 1; i < argc; i++) {
            fp = fopen(argv[i], "r");
            if (fp == NULL) {
                fputs("Can't open ", stderr);
                fputs(argv[i], stderr);
                fputc('\n', stderr);
            } else regularize(fp);
        }
    }
    return 0;
}
=end