/*
 * Filter that reads text from stdin and writes UTF-8 to stdout.
 *
 * The operating mode is selected by the NUMBER of command-line arguments
 * (their contents are ignored):
 *   no argument     convert CP437 ("OEM") to UTF-8
 *   one argument    guess mode: decide OEM / ANSI / UTF-8 at the first
 *                   byte >= 0x80
 *   two arguments   guess mode with the OEM/ANSI decision inverted
 *   three arguments convert CP1252 ("ANSI") to UTF-8
 *
 * A leading UTF-8 byte order mark (EF BB BF) or UTF-16 little-endian byte
 * order mark (FF FE) overrides the selected mode.
 *
 * Exit status:
 *   -3  converted from UTF-16LE to UTF-8 (byte order mark stripped)
 *   -1  UTF-8 byte order mark stripped, rest copied unchanged
 *    0  UTF-8 input detected (guess mode) and not converted
 *    1  OEM (CP437) converted to UTF-8
 *    2  plain ASCII (guess mode, no byte >= 0x80 seen)
 *    3  plain ASCII (inverse guess mode, no byte >= 0x80 seen)
 *    4  ANSI (CP1252) converted to UTF-8
 */
#include <stddef.h>
#include <stdio.h>

/* bool/true/false are keywords in C23 and come from <stdbool.h> since C99;
   only pre-C99 compilers need the hand-made type. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#include <stdbool.h>
#else
typedef enum {false,true} bool;
#endif

/* Unicode code points for CP437 bytes 0x80..0xFF. */
const wchar_t cp437uni[128]={
    0x00C7,0x00FC,0x00E9,0x00E2,0x00E4,0x00E0,0x00E5,0x00E7,
    0x00EA,0x00EB,0x00E8,0x00EF,0x00EE,0x00EC,0x00C4,0x00C5,
    0x00C9,0x00E6,0x00C6,0x00F4,0x00F6,0x00F2,0x00FB,0x00F9,
    0x00FF,0x00D6,0x00DC,0x00A2,0x00A3,0x00A5,0x20A7,0x0192,
    0x00E1,0x00ED,0x00F3,0x00FA,0x00F1,0x00D1,0x00AA,0x00BA,
    0x00BF,0x2310,0x00AC,0x00BD,0x00BC,0x00A1,0x00AB,0x00BB,
    0x2591,0x2592,0x2593,0x2502,0x2524,0x2561,0x2562,0x2556,
    0x2555,0x2563,0x2551,0x2557,0x255D,0x255C,0x255B,0x2510,
    0x2514,0x2534,0x252C,0x251C,0x2500,0x253C,0x255E,0x255F,
    0x255A,0x2554,0x2569,0x2566,0x2560,0x2550,0x256C,0x2567,
    0x2568,0x2564,0x2565,0x2559,0x2558,0x2552,0x2553,0x256B,
    0x256A,0x2518,0x250C,0x2588,0x2584,0x258C,0x2590,0x2580,
    0x03B1,0x00DF,0x0393,0x03C0,0x03A3,0x03C3,0x03BC,0x03C4,
    0x03A6,0x0398,0x03A9,0x03B4,0x221E,0x03C6,0x03B5,0x2229,
    0x2261,0x00B1,0x2265,0x2264,0x2320,0x2321,0x00F7,0x2248,
    0x00B0,0x2219,0x00B7,0x221A,0x207F,0x00B2,0x25A0,0x00A0};

/* Unicode code points for CP1252 bytes 0x80..0x9F.
   Several positions are mapped to a plain space (0x0020). */
const wchar_t cp1252uni[32]={
    0x20AC,0x0020,0x201A,0x0192,0x201E,0x2026,0x2020,0x2021,
    0x02C6,0x2030,0x0160,0x2039,0x0152,0x0020,0x017D,0x0020,
    0x0020,0x2018,0x2019,0x201C,0x201D,0x2022,0x2013,0x2014,
    0x02DC,0x2122,0x0161,0x203A,0x0153,0x0020,0x017E,0x0178};

/* Operating modes; the numeric values double as the process exit status. */
enum {
    MODE_UTF16LE     = -3,  /* UTF-16LE input, convert to UTF-8 */
    MODE_UTF8_BOM    = -1,  /* UTF-8 byte order mark stripped, copy the rest */
    MODE_UTF8        =  0,  /* copy bytes unchanged */
    MODE_OEM         =  1,  /* CP437 to UTF-8 */
    MODE_GUESS       =  2,  /* undecided, decide at first byte >= 0x80 */
    MODE_GUESS_INV   =  3,  /* undecided, inverted OEM/ANSI decision */
    MODE_ANSI        =  4   /* CP1252 to UTF-8 */
};

/* Small LIFO of bytes that were read ahead and must be re-read.
   ungetc() guarantees only one byte of pushback; BOM checking and
   surrogate look-ahead need two. */
enum { PUSHBACK_MAX = 4 };
static int pushback_buf[PUSHBACK_MAX];
static int pushback_len = 0;

/* Return the next input byte (0..255) or EOF, honouring pushed-back bytes. */
static int next_byte(void)
{
    if (pushback_len > 0)
        return pushback_buf[--pushback_len];
    return getchar();
}

/* Push a byte back so that the next next_byte() returns it; EOF is ignored. */
static void unread_byte(int b)
{
    if (b != EOF && pushback_len < PUSHBACK_MAX)
        pushback_buf[pushback_len++] = b;
}

/* Write code point c to stdout as UTF-8.

   Code       <0x80       <0x800      <0x10000    else
   1. Byte    0xxx xxxx   110x xxxx   1110 xxxx   1111 0xxx
   2. Byte                10xx xxxx   10xx xxxx   10xx xxxx
   3. Byte                            10xx xxxx   10xx xxxx
   4. Byte                                        10xx xxxx
*/
static void put_utf8(unsigned long c)
{
    if (c < 0x80UL) {
        putchar((int)c);
    } else if (c < 0x800UL) {
        putchar((int)(0xC0UL | (c >> 6)));
        putchar((int)(0x80UL | (c & 0x3FUL)));
    } else if (c < 0x10000UL) {
        putchar((int)(0xE0UL | (c >> 12)));
        putchar((int)(0x80UL | ((c >> 6) & 0x3FUL)));
        putchar((int)(0x80UL | (c & 0x3FUL)));
    } else {
        putchar((int)(0xF0UL | (c >> 18)));
        putchar((int)(0x80UL | ((c >> 12) & 0x3FUL)));
        putchar((int)(0x80UL | ((c >> 6) & 0x3FUL)));
        putchar((int)(0x80UL | (c & 0x3FUL)));
    }
}

/* Decide whether lead byte ch followed by byte ct looks like the start of
   a UTF-8 sequence.  Assuming a two-byte or three-byte UTF-8 composition
   here!  Only these two bytes are inspected; ct may be EOF. */
static bool isutf8(int ch, int ct)
{
    int partial;

    if (ct < 0x80 || ct >= 0xC0)    /* ct is not a continuation byte */
        return false;
    if (ch < 0xC0 || ch >= 0xF0)    /* ch is not a 2- or 3-byte lead */
        return false;
    if (ch >= 0xE0) {
        /* three-byte form: top ten bits of the code point */
        partial = (ct & 0x3F) << 6 | (ch & 0x0F) << 12;
        return partial >= 0x800 ? true : false;   /* reject overlong forms */
    }
    /* two-byte form: the complete code point */
    partial = (ct & 0x3F) | (ch & 0x1F) << 6;
    return partial >= 0xA0 ? true : false;        /* reject values below U+00A0 */
}

/* Program entry point.  Only the argument COUNT matters (see the comment at
   the top of the file); the return value is the final mode. */
int main(int argc, char **argv)
{
    int mode = argc;
    int c;

    (void)argv;

    /* Look for a byte order mark.  Bytes that turn out not to form one are
       pushed back so that they are converted like any other input. */
    c = next_byte();
    if (c == 0xEF) {
        int b1 = next_byte();
        int b2 = next_byte();
        if (b1 == 0xBB && b2 == 0xBF) {
            mode = MODE_UTF8_BOM;           /* don't convert */
            c = next_byte();
        } else {
            unread_byte(b2);
            unread_byte(b1);
        }
    } else if (c == 0xFF) {
        int b1 = next_byte();
        if (b1 == 0xFE) {
            mode = MODE_UTF16LE;            /* convert special */
            c = next_byte();
        } else {
            unread_byte(b1);
        }
    }

    for (; c != EOF; c = next_byte()) {
        if (mode == MODE_UTF16LE) {
            unsigned long unit;
            int hi = next_byte();

            if (hi == EOF)
                break;                      /* odd trailing byte: drop it */
            unit = (unsigned long)c | ((unsigned long)hi << 8);

            if (unit >= 0xD800UL && unit <= 0xDBFFUL) {
                /* High surrogate: needs a following low surrogate. */
                int lo0 = next_byte();
                int lo1 = (lo0 == EOF) ? EOF : next_byte();
                unsigned long low = 0;

                if (lo1 != EOF)
                    low = (unsigned long)lo0 | ((unsigned long)lo1 << 8);
                if (low >= 0xDC00UL && low <= 0xDFFFUL) {
                    unit = 0x10000UL + ((unit - 0xD800UL) << 10)
                                     + (low - 0xDC00UL);
                } else {
                    /* Unpaired: re-read the following unit normally. */
                    unread_byte(lo1);
                    unread_byte(lo0);
                    unit = 0xFFFDUL;        /* REPLACEMENT CHARACTER */
                }
            } else if (unit >= 0xDC00UL && unit <= 0xDFFFUL) {
                unit = 0xFFFDUL;            /* stray low surrogate */
            }
            put_utf8(unit);
            continue;
        }

        if (mode > 0 && c >= 0x80) {
            if (mode == MODE_GUESS || mode == MODE_GUESS_INV) {
                /* First non-ASCII byte: peek one byte ahead to see
                   whether the input already is UTF-8. */
                int c2 = next_byte();
                if (isutf8(c, c2))
                    mode = MODE_UTF8;       /* switch to UTF-8 mode */
                unread_byte(c2);
            }
            /* Still undecided: pick OEM or ANSI from this byte's value
               (heuristic), and stay with the choice from then on. */
            if (mode == MODE_GUESS) {
                mode = (c >= 0xA6 && c != 0xE1) ? MODE_ANSI : MODE_OEM;
            } else if (mode == MODE_GUESS_INV) {
                mode = (c >= 0xA6 && c != 0xE1) ? MODE_OEM : MODE_ANSI;
            }
            if (mode == MODE_OEM) {
                c = cp437uni[c - 0x80];
            } else if (mode == MODE_ANSI && c < 0xA0) {
                /* 0xA0..0xFF are used as code points unchanged */
                c = cp1252uni[c - 0x80];
            }
        }

        if (mode != MODE_UTF8 && mode != MODE_UTF8_BOM)
            put_utf8((unsigned long)c);
        else
            putchar(c);
    }
    return mode;
}