Unicode into Character Encoding

<..functions ..>+

/* Write one character of output to htFile, holding back "&...;"
   sequences so that they can be examined as a unit.

   - On '&': whatever is pending is flushed first; the '&' is then either
     written at once (put_4ht_off non-zero) or becomes the first buffered
     character in uni_code[], with htFile remembered in put_4ht_file.
   - While a sequence is buffered (uni_code_p non-zero): ';' hands the
     buffer to the "process unicode" code and empties it; a character
     outside [0-9a-fA-F#xX], or a buffer one short of MAX_UNI_CODE,
     flushes the buffer and is then written normally; any other
     character is appended to the buffer.
   - Otherwise the character is written with putc().

   Returns the putc() result in the branches where this function itself
   wrote ch, and ch unchanged in the remaining branches. */
static  INTEGER put_4ht_ch(ch,htFile)    int ch ; FILE* htFile
;{
                                  int c;
   c = ch;
   if( ch==’&’ ){
      /* a new '&' ends any sequence that was still being collected */
      <.flush incomplete unicode .>
      if( put_4ht_off ){
         c = putc( ch, htFile );
      } else {
         /* start collecting; remember the stream for a later flush */
         uni_code[0] = ’&’;
         uni_code_p = 1;
         put_4ht_file = htFile;
      }
   } else
   if( uni_code_p ){
     if( ch == ’;’ ){ <.process unicode .> uni_code_p = 0; }
     /* give up on the sequence when the buffer is about to fill up or
        the character cannot be part of "&#ddd" / "&#xhhh" */
     else if (     ((uni_code_p+1) == MAX_UNI_CODE)
                ||
                   (    ((ch<’0’) || (ch>’9’))
                     && ((ch<’a’) || (ch>’f’))
                     && ((ch<’A’) || (ch>’F’))
                     && (ch!=’#’)
                     && (ch!=’x’)
                     && (ch!=’X’)
                   )
             )
     { <.flush incomplete unicode .>
       c = putc( ch, htFile );
     } else { uni_code[ uni_code_p++ ] = ch; }
   } else { c = putc( ch, htFile ); }
   return  c;
}
-_-_-

<..vars ..>+
/* Stream the buffered "&..." sequence belongs to: set by put_4ht_ch when
   buffering starts, written to and reset to null by flush_uni. */
static FILE* put_4ht_file = (FILE *) 0;
/* While non-zero, put_4ht_ch writes '&' straight through instead of
   starting to buffer; adjusted by the "on/off unicode" code. */
static int put_4ht_off = 1;
/* Characters of the "&..." sequence collected so far. */
static char uni_code[MAX_UNI_CODE];
/* Number of characters held in uni_code; 0 means nothing is pending. */
static short uni_code_p = 0;
-_-_-

<..on/off unicode ..>
/* Adjust the put_4ht_off counter according to the next input character:
   '+' increments it and flushes any pending "&..." sequence; '-'
   decrements it while it is positive and otherwise issues warning 52.
   Any other character leaves the counter untouched (there is no default
   case). */
special_n--;
switch ( code = get_char() ){
   case ’+’: { put_4ht_off++; <.flush incomplete unicode .> break; }
   case ’-’: { if( put_4ht_off>0 ){ put_4ht_off--; }
               else { warn_i_str(52, "@u-"); }
               break; }
}
-_-_-

16.2 Flush Unrecognized Codes

<..functions ..>+
static void flush_uni( MYVOID )
{
                                    int i;
   for(  i=0; i<uni_code_p; i++ ){
     (IGNORED)  putc( uni_code[i], put_4ht_file );
   }
   uni_code_p = 0;
   put_4ht_file = (FILE *) 0;
}
-_-_-

16.3 Load unicode.4hf Table

<..unicode.4hf vars ..>
/* Working storage for reading unicode.4hf.  `in' receives the text of
   the fields of the current line, each terminated by a NUL; in_p is the
   next free position.  start[1..3] point at the 2nd..4th field inside
   `in' (the 1st field begins at `in' itself; start[0] is not assigned by
   the scanning code). */
int chr, delimiter, delimiter_n, line_no, digit, i, j;
U_CHAR in[512], *in_p, * start[4], *p;
BOOL char_on, err;
int value;
-_-_-

<..read unicode.4hf ..>
/* Read the table file one line at a time.  A line whose first character
   is printable (33..126) uses that character as its field delimiter and
   must hold four delimited fields (8 delimiters); anything else on such
   a line is reported through the error code below.  Lines starting with
   any other character are skipped.  A first field beginning with '?'
   makes the line a silent no-op. */
err = FALSE;
line_no = 0;
while( TRUE ){
   line_no++;
   chr = (int) getc(file);
   if( chr == EOF ){ break; }
   if( (chr>32) && (chr<127) ){
      <.scan 4hf fields .>
      if( delimiter_n == 8 ){
         if( *in != '?' ) {
            if( <.not hexa unicode?.> ){ err = TRUE; }
            else {
               <.value = hex into int .>
               /* A malformed hex code yields a bogus value; storing it
                  could overwrite an unrelated entry, so neither table
                  accepts the line (previously only the type-less branch
                  was guarded).  An empty fourth field selects the
                  type-less table. */
               if( !err ){
                  if( start[3] == (in_p-1) ){
                     <.store type-less 4hf endtry .>
                  } else { <.store typed 4hf endtry .> }
               }
      }  } }
      else { err = TRUE; }
      <.error 4hf fields .>
   }
   /* discard whatever is left of the current line */
   while( (chr != EOF) && (chr!='\n') ){
      chr = (int) getc(file);
   }
   if( chr == EOF ){ break; }
}
-_-_-

<..not hexa unicode?..>
    /* True unless the first field has the form "&#x...;" or "&#X...;".
       start[1]-1 is the NUL ending the first field, so start[1]-2 is its
       last character; the earlier tests short-circuit before that read
       when the field is too short to start with "&#x". */
    (*in             != ’&’)
|| (*(in+1)         != ’#’)
|| ( (*(in+2)       != ’x’) && (*(in+2) != ’X’))
|| (*(start[1] - 2) != ’;’)
-_-_-

<..scan 4hf fields ..>
/* Split the rest of the current line into fields.  chr holds the line's
   first character, which serves as the delimiter.  Text between an
   opening and a closing delimiter is copied to `in' and NUL-terminated;
   text between fields is dropped.  start[k] records where field k+1
   begins.  The scan stops at end of line, end of file, the 8th
   delimiter, or when `in' is full. */
delimiter   = chr;
delimiter_n = 1;
char_on     = TRUE;
in_p = in;
while( TRUE ) {
   chr = (int) getc(file);
   if( (chr == EOF) || (chr=='\n') ){ break; }
   /* Never write past the end of `in'.  delimiter_n is still below 8
      here, so an over-long line is reported as malformed by the
      surrounding code instead of overrunning the buffer. */
   if( in_p == in + sizeof(in)/sizeof(in[0]) ){ break; }
   if( chr == delimiter ){
      if( char_on ){ *(in_p++) = '\0'; }
      else{ start[ delimiter_n/2 ] = in_p; }
      char_on = !char_on;
      delimiter_n++;
   } else if (char_on ) {
      *(in_p++) = chr;
   }
   if( delimiter_n==8 ){ break; }
}
-_-_-

<..error 4hf fields ..>
/* If the current line was rejected, issue warning 48 with the line
   number and echo the fields that were collected in `in', re-inserting
   the delimiter around them, then clear err for the next line. */
if( err ){
    warn_i_int(48,line_no);
    (IGNORED) printf( "%c", delimiter );
    for( p=in; p != in_p; p++ ){
      if( *p==’\0’ ){
        /* end of a field: closing delimiter, and unless this was the
           last stored character, a gap plus the next opening delimiter */
        (IGNORED) printf("%c", delimiter);
        if( p != in_p-1 ){ (IGNORED) printf("  %c", delimiter); }
      }
      else { (IGNORED) printf( "%c", *p ); }
    }
    (IGNORED) printf( "\n" );
    err = FALSE;
}
-_-_-

16.4 Store Entry of Table

<..vars ..>+
/* charset: table of (code, replacement string) records, kept sorted by
   ascending code by the insertion code.  charset_n is the number of
   records in use, max_charset_n the allocated capacity. */
static int charset_n = 0, max_charset_n;
static struct charset_rec *charset;
-_-_-

<..mem for charset ..>
/* Initial capacity of the charset table; the table is enlarged when
   entries are stored and it runs short. */
max_charset_n = 256;
charset = m_alloc(struct charset_rec, max_charset_n);
-_-_-

<..value = hex into int ..>
/* Convert the hex digits of the first field, from in+3 (just past
   "&#x") up to the ';', into value.  A character that is not a hex
   digit contributes 0 and sets err.
   NOTE(review): BASE_A / BASE_a are defined elsewhere; presumably they
   map 'A'/'a' to 10 -- confirm.  There is no guard against overflow of
   value for very long digit strings. */
value = 0;
for( p=in+3; *p!=’;’; p++){
   digit = (int) *p;
   if( (digit>=’0’) && (digit<=’9’) ){ digit -= ’0’; }
   else if( (digit>=’A’) && (digit<=’F’) ){ digit -= BASE_A; }
   else if( (digit>=’a’) && (digit<=’f’) ){ digit -= BASE_a; }
   else { digit=0; err = TRUE; }
   value = 16*value + digit;
}
-_-_-

<..store type-less 4hf endtry ..>
/* Record "value -> third field" in charset[], keeping the table sorted
   by ascending ch.  An entry that already exists for value has its
   string replaced. */
<.mem for new 4ht entry .>
/* start[3]-start[2] spans the third field plus its terminating NUL */
p = m_alloc(char, (int) (start[3] - start[2]) );
(IGNORED) strcpy((char *) p, (char *) start[2] );
/* walk down from the top looking for the slot i that value belongs in */
i = charset_n;
while( i-- > 0 ){
   if( charset[i].ch == value ){
      /* same code already present: drop its old string, reuse slot i */
      free((void *) charset[i].str);
      break;
   } else {
      /* insert right after the first smaller entry met, or at the
         front when even entry 0 is larger */
      if(   (charset[i].ch < value)
         || ((charset[i].ch > value) && (i==0)) ){
         if( charset[i].ch < value ){ i++; }
         charset_n++;
         /* shift the tail up by one to open slot i.
            NOTE(review): j starts at the already incremented charset_n,
            so the unused element just past the old end is copied too. */
         for( j=charset_n; j>i; j-- ){
            charset[j].ch  = charset[j-1].ch;
            charset[j].str = charset[j-1].str;
         }
         break;
   }  }
}
/* the loop ran off the bottom only when the table was empty: append */
if(i == -1){ i = charset_n; }
if( i==charset_n ){ charset_n++; }
charset[i].str = p;
charset[i].ch  = value;
-_-_-

<..mem for new 4ht entry ..>
/* Grow the charset table by ten records once only a single free record
   is left. */
if( max_charset_n == (charset_n+1) ){
   max_charset_n += 10;
   charset = (struct charset_rec *) r_alloc((void *) charset,
         (size_t) (max_charset_n * sizeof(*charset)));
}
-_-_-

16.5 Use Unicode Substitution

<..process unicode ..>
/* Runs when ';' (held in ch) closes the sequence buffered in uni_code[].
   A sequence of the form "&#ddd" or "&#xhhh" is converted to its code
   and looked up in the 4hf table; anything else is written out
   unchanged, followed by the ';'.

   Only the first uni_code_p characters of uni_code[] belong to the
   current sequence; the rest is left over from earlier ones.  The
   length checks keep "&;" and "&#;" from being interpreted through
   such stale characters, and a reference without any digit ("&#;",
   "&#x;") is passed through instead of being treated as code 0. */
if( (uni_code_p < 2) || (uni_code[1] != '#') ){
    <.flush incomplete unicode .>
    (IGNORED)  putc( ch, htFile );
}
else{
       int i, base, value, digit;
   if( (uni_code_p > 2)
       && ((uni_code[2] == 'x') || (uni_code[2] == 'X')) ){
      base =16; i=3;
   } else { base=10; i=2; }
   /* -1 marks "not a valid reference"; with no digits the loop below
      never runs and the mark stays */
   value = (i < uni_code_p)? 0 : -1;
   for( ; i<uni_code_p; i++ ){
     digit = uni_code[i];
     if( (digit>='0') && (digit<='9') ){  digit -= '0'; }
     else if( (digit>='A') && (digit<='F') ){  digit -= BASE_A; }
     else if( (digit>='a') && (digit<='f') ){  digit -= BASE_a; }
     else { value = -1; break; }
     /* hex letters inside a decimal reference */
     if( digit >= base ){ value=-1; break; }
     value = value*base + digit;
   }
   if( value<0 ){ <.flush incomplete unicode .>
                  (IGNORED)  putc( ch, htFile );
   } else {
      <.search 4hf table .>
} }
-_-_-

<..search 4hf table ..>
     /* Look value up in charset[] (sorted by ascending ch).  On a hit
        the replacement text is written.  On a miss the buffered
        reference is optionally rewritten (decimal entity or UTF-8
        bytes) and then flushed. */
     int bottom, mid, top;
     BOOL found=FALSE;
/* Search the closed range [bottom, top] of records in use.  Probing
   index charset_n, or index 0 of an empty table, would compare against
   a record that was never stored. */
bottom = 0; top = charset_n - 1;
while( bottom <= top ){
    mid = bottom + (top - bottom) / 2;
    if( value == charset[mid].ch ){
       <.put 4hf replacement .>
       found = TRUE;
       break;
    }
    if( value < charset[mid].ch ){ top = mid - 1; }
    else                         { bottom = mid + 1; }
}
if( ! found ){
       BOOL is_entity;
    if( u10 || utf8 ){ <.hex uni to base 10 or utf8 .> }
    /* The closing ';' belongs in the output whenever the buffer still
       holds an "&#..." reference.  That is the case for every path
       except a completed UTF-8 conversion, which leaves either a single
       byte or a lead byte with the high bits set.  Testing the buffer,
       rather than the utf8 flag alone, keeps the ';' for references
       that were not converted (decimal input, u10 taking precedence,
       codes beyond the UTF-8 range).  Must be evaluated before the
       flush empties the buffer. */
    is_entity = (uni_code_p > 1) && (uni_code[0] == '&');
    <.flush incomplete unicode .>
    if( is_entity ){ (IGNORED) putc( ch, htFile ); }
}
-_-_-

<..put 4hf replacement ..>
/* Write the replacement text charset[mid].str to htFile.  Inside the
   text "\\" stands for one backslash and "\ddd\" for the character
   with decimal code ddd; everything else is copied literally.  After a
   literal '&', with u10 set, a following hex reference is rewritten in
   base 10.

   The text comes from the table file, so a backslash at the very end
   or an escape without its closing backslash must not carry the scan
   past the terminating NUL. */
{          U_CHAR *p;
    p = charset[mid].str;
    while( *p != '\0' ){
      if( *p=='\\' ){
        p++;
        if( *p=='\0' ){ break; }          /* lone trailing backslash */
        if( *p=='\\' ){
          (IGNORED) putc( '\\', htFile );
        } else {
              int i;
          i = *p - '0';
          while( (*(++p) != '\\') && (*p != '\0') ){
            i = 10*i + *p - '0';
          }
          (IGNORED) putc( i, htFile );
          if( *p=='\0' ){ break; }        /* escape was never closed */
      } }
      else {
        (IGNORED) putc( *p, htFile );
        if ( (*p=='&') && u10 ){ <.u10 for 4hf replacement .> }
      }
      p++;
}  }
-_-_-

16.6 Base 10 for Entity Codes

<..hex uni to base 10 or utf8 ..>
       /* Rewrite a buffered hex reference "&#xhhh" in place: as a
          decimal reference when u10 is set, otherwise as UTF-8.  A
          buffer that does not have 'x'/'X' at index 2 is left alone.
          The digits are not re-validated here; the conversion below
          assumes each of them is 0-9, A-F or a-f.  The ch declared here
          is local to this code. */
       short  n;
       long   dec;
       int    ch;
       char   uni_10[MAX_UNI_CODE];
if( (uni_code[2] == ’x’) || (uni_code[2] == ’X’) ) {
    dec = 0;
    for(  n=3; n<uni_code_p; n++ ){
       ch = uni_code[n];
       /* digit value: 0-9 directly, letters as 10 + offset from 'A' or
          'a' (anything above 'Z' is taken to be lower case) */
       dec = 16*dec +
              ((ch > ’9’)?
                          ( 10 + ((ch > ’Z’)? (ch-’a’) : (ch-’A’)) )
                        : (ch-’0’));
    }
    if( u10 ){ <.dec to u10 .> }
    else     { <.uni in utf8 .> }
}
-_-_-

<..dec to u10 ..>
    /* Replace the buffer contents after "&#" by dec written in base 10.
       The digits are produced least significant first in uni_10 and
       copied back in reverse.
       NOTE(review): nothing here checks that the decimal form fits in
       uni_code / uni_10 (MAX_UNI_CODE chars); confirm against the
       longest hex reference the buffer can hold. */
    if( dec == 0 ){
       uni_code_p = 3;  uni_code[2] = ’0’;
    } else {
       n = 0;
       while( dec > 0 ){  uni_10[ n++ ] = dec % 10 + ’0’;   dec /= 10;  }
       uni_code_p = 2;
       while( n>0 ){  uni_code[ uni_code_p++ ] = uni_10[ --n ]; }
    }
-_-_-

<..u10 for 4hf replacement ..>
/* p points at an '&' of the replacement text that has just been
   written.  If "#x" / "#X" and hex digits ending in ';' follow, the
   '#' and the value in base 10 are written and p is moved so that the
   enclosing loop continues with the ';'.  If '#' is followed by
   something else, only the '#' is written and p is left on it.  If the
   hex digits end in anything but ';', nothing more is written and p
   stays on the '#', so the remaining characters are copied literally by
   the enclosing loop. */
if ( *(p+1) == ’#’ ){
   p++;
   (IGNORED) putc( ’#’, htFile );
   if ( (*(p+1) == ’x’) || (*(p+1) == ’X’) ){
                  int value, digit;
                  U_CHAR *q;
      q = p+2;
      value = 0;
      digit = *(q++);
      while( digit!=0 ){
        if( (digit>=’0’) && (digit<=’9’) ){
           value = value*16 + digit - ’0’;
        }
        else if( (digit>=’A’) && (digit<=’F’) ){
           value = value*16 + digit - ’A’+10;
        }
        else if( (digit>=’a’) && (digit<=’f’) ){
           value = value*16 + digit - ’a’+10; }
        else {
          if( digit == ’;’ ){
            <.display value in u10 .>
            /* q is one past the ';'; step back so the enclosing
               loop's p++ lands on the ';' */
            p=q-2;
          }
          break;
        }
        digit = *(q++);
      }
} }
-_-_-

<..display value in u10 ..>
               char   uni_10[MAX_UNI_CODE];
               int n;
n = 0;
while( value>0 ){
   uni_10[ n++ ] = value % 10 + ’0’;
   value /= 10;
}
while( n>0 ){
    (IGNORED) putc(  uni_10[--n], htFile );
}
-_-_-

16.7 UTF-8 for Entity Codes

U-00000000 - U-0000007F:  0xxxxxxx
U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

<..dec to utf8 ..>
/* Replace the contents of uni_code[] by the UTF-8 encoding of dec,
   following the table above (1 to 6 bytes), and set uni_code_p to the
   number of bytes.  Values above 0x7FFFFFFF are left untouched; this
   code deliberately ends on an `if' without `else' so that a caller can
   attach the handling of that case. */
if( dec < 0x80 ){
    uni_code[0] = dec;
    uni_code_p = 1;
}
else if( dec <= 0x7FFFFFFF ){
       int  k;
       long bits, lead;
    /* sequence length and the marker bits of its first byte */
    if     ( dec < 0x800     ){ uni_code_p = 2;  lead = 0xC0; }
    else if( dec < 0x10000   ){ uni_code_p = 3;  lead = 0xE0; }
    else if( dec < 0x200000  ){ uni_code_p = 4;  lead = 0xF0; }
    else if( dec < 0x4000000 ){ uni_code_p = 5;  lead = 0xF8; }
    else                      { uni_code_p = 6;  lead = 0xFC; }
    /* continuation bytes, last one first, six payload bits each */
    bits = dec;
    for( k = uni_code_p - 1; k > 0; k-- ){
       uni_code[k] = (bits & 0x3F) | 0x80;
       bits >>= 6;
    }
    /* what is left of the value goes into the first byte */
    uni_code[0] = bits | lead;
}
-_-_-

<..uni in utf8 ..>
/* Encode dec as UTF-8 in the buffer.  The else branch completes the
   if-chain of the included code: a value that chain does not cover
   (above 0x7FFFFFFF) is instead rewritten as decimal digits after the
   "&#" already in the buffer. */
<.dec to utf8 .>
else {
    n = 0;
    while( dec > 0 ){  uni_10[ n++ ] = dec % 10 + ’0’;   dec /= 10;  }
    uni_code_p = 2;
    while( n>0 ){  uni_code[ uni_code_p++ ] = uni_10[ --n ]; }
}
-_-_-

16.8 Replacements for HTF

<..store typed 4hf endtry ..>
/* Record "value -> third field", together with two type numbers, in
   htf_4hf[], keeping the table sorted by ascending ch.  An entry that
   already exists for value has its string replaced.  The included
   memory code also allocates p for the copy of the third field. */
<.mem for new htf-4hf entry .>
(IGNORED) strcpy((char *) p, (char *) start[2] );
/* walk down from the top looking for the slot i that value belongs in */
i = htf_4hf_n;
while( i-- > 0 ){
   if( htf_4hf[i].ch == value ){
      /* same code already present: drop its old string, reuse slot i */
      free((void *) htf_4hf[i].str);
      break;
   } else {
      /* insert right after the first smaller entry met, or at the
         front when even entry 0 is larger */
      if(   (htf_4hf[i].ch < value)
         || ((htf_4hf[i].ch > value) && (i==0)) ){
         if( htf_4hf[i].ch < value ){ i++; }
         htf_4hf_n++;
         /* shift the tail up by one to open slot i.
            NOTE(review): j starts at the already incremented htf_4hf_n,
            so the unused element just past the old end is copied too. */
         for( j=htf_4hf_n; j>i; j-- ){
            htf_4hf[j].ch = htf_4hf[j-1].ch;
            htf_4hf[j].str = htf_4hf[j-1].str;
            htf_4hf[j].type1  = htf_4hf[j-1].type1;
            htf_4hf[j].type2  = htf_4hf[j-1].type2;
         }
         break;
}  } }
/* the loop ran off the bottom only when the table was empty: append */
if(i == -1){ i = htf_4hf_n; }
if(i == htf_4hf_n){ htf_4hf_n++; }
htf_4hf[i].str = p;
htf_4hf[i].ch  = value;
/* the two pieces of code below reuse value and p as scratch, which is
   why they come after the assignments above */
<.htf_4hf[i].type1 = ....>
<.htf_4hf[i].type2 = ....>
-_-_-

<..end loading fonts ..>+
for( i = 0; i<htf_4hf_n; i++){
free((void *) htf_4hf[i].str);
}
free((void *) htf_4hf);
-_-_-

<..mem for new htf-4hf entry ..>
/* Grow the htf_4hf table by ten records once only a single free record
   is left, then allocate room for a copy of the third field (its text
   plus the terminating NUL). */
if( max_htf_4hf_n == (htf_4hf_n+1) ){
   max_htf_4hf_n += 10;
   htf_4hf = (struct htf_4hf_rec *) r_alloc((void *) htf_4hf,
         (size_t) (max_htf_4hf_n * sizeof(*htf_4hf)));
}
p = m_alloc(char, (int) (start[3] - start[2]) );
-_-_-

<..htf_4hf[i].type1 = .....>
/* type1 of the new record: the leading decimal digits of the second
   field (start[1]); 0 when the field does not start with a digit.
   value and p are reused as scratch variables. */
value = 0;
p = start[1];
while( *p != ’\0’ ){
    if( (*p < ’0’) || (*p > ’9’) ) break;
    value = value * 10 + *p - ’0’;
    p++;
}
htf_4hf[i].type1  =  value;
-_-_-

<..htf_4hf[i].type2 = .....>
/* type2 of the new record: the leading decimal digits of the fourth
   field (start[3]); 0 when the field does not start with a digit.
   value and p are reused as scratch variables. */
value = 0;
p = start[3];
while( *p != ’\0’ ){
    if( (*p < ’0’) || (*p > ’9’) ) break;
    value = value * 10 + *p - ’0’;
    p++;
}
htf_4hf[i].type2  =  value;
-_-_-

<..vars ..>+
/* htf_4hf: table of typed replacement records (code, string, type1,
   type2), kept sorted by ascending code by the insertion code.
   htf_4hf_n is the number of records in use, max_htf_4hf_n the
   allocated capacity. */
static int htf_4hf_n = 0, max_htf_4hf_n;
static struct htf_4hf_rec *htf_4hf;
-_-_-

<..mem for charset ..>+
/* Initial capacity of the typed replacement table; the table is
   enlarged when entries are stored and it runs short. */
max_htf_4hf_n = 256;
htf_4hf = m_alloc(struct htf_4hf_rec, max_htf_4hf_n);
-_-_-

16.9 Propagate Changes into the HTF Fonts

<..propagate 4hf info into htf ..>
/* If str has the form "&#x...;" (or "&#X...;"), convert its hex digits
   to a code and, when they are all valid, look for a replacement in the
   tables.  The tests are ordered so that the fixed prefix is checked
   before the last character is examined. */
if(
        (*str             == ’&’)
     && (*(str+1)         == ’#’)
     && ( (*(str+2)       == ’x’) || (*(str+2) == ’X’))
     && (*(str + strlen((char *) str) - 1) == ’;’)
) {
         char* p;
         int   value = 0;
         BOOL  err = FALSE;
     /* terminates: the test above guarantees a ';' at or after str+3 */
     for( p=str+3; *p!=’;’; p++){
       int digit = (int) *p;
       if( (digit>=’0’) && (digit<=’9’) ){ digit -= ’0’; }
       else if( (digit>=’A’) && (digit<=’F’) ){ digit -= BASE_A; }
       else if( (digit>=’a’) && (digit<=’f’) ){ digit -= BASE_a; }
       else { digit=0; err = TRUE; }
       value = 16*value + digit;
     }
     if( !err ){
       <.search 4hf replacement in htf-4hf .>
       <.search 4hf replacement in charset .>
}   }
-_-_-

<..htf replacement from htf-4hf ..>
/* The record found applies only when its first type equals the current
   ch1: str then takes the record's replacement text and ch1 its second
   type.
   NOTE(review): the copy assumes str has room for the replacement;
   its size is not visible here. */
if( ch1 == htf_4hf[mid].type1 ){
    (IGNORED) strcpy((char *) str, (char *) htf_4hf[mid].str );
    ch1 = htf_4hf[mid].type2;
}
-_-_-

<..htf replacement from charset ..>
/* Counterpart of the htf-4hf replacement for a record found in
   charset[]: when its first type equals ch1, str takes the record's
   replacement text and ch1 its second type.  The string has to be taken
   from the record at mid; charset itself is a pointer, so it has no
   member to select with `.'.
   NOTE(review): the code that stores charset records sets only ch and
   str; confirm that struct charset_rec has type1/type2 members and that
   this code is still referenced anywhere. */
if( charset[mid].type1 == ch1  ){
    ch1 = charset[mid].type2;
    (IGNORED) strcpy((char *) str, (char *) charset[mid].str );
}
-_-_-

<..search 4hf replacement in htf-4hf ..>
     /* Binary search for value in htf_4hf[] (sorted by ascending ch);
        on a hit the replacement code for this table is applied. */
     int bottom, mid, top;
     BOOL found=FALSE;
/* Search the closed range [bottom, top] of records in use.  Probing
   index htf_4hf_n, or index 0 of an empty table, would compare against
   a record that was never stored. */
bottom = 0; top = htf_4hf_n - 1;
while( !found && (bottom <= top) ){
    mid = bottom + (top - bottom) / 2;
    if( value == htf_4hf[mid].ch ){
       <.htf replacement from htf-4hf .>
       found = TRUE;
    } else if( value < htf_4hf[mid].ch ){
       top = mid - 1;
    } else {
       bottom = mid + 1;
    }
}
-_-_-

<..htf replacement from charset ..>+
/* Binary search for value in charset[] (sorted by ascending ch),
   skipped when an earlier search has already set found.  bottom, mid,
   top and found are the variables declared by the htf-4hf search.
   Only the closed range [bottom, top] of records in use is probed;
   index charset_n, or index 0 of an empty table, holds no stored record.
   NOTE(review): this code is filed under the name of the charset
   replacement code, while the propagate code refers to a "search 4hf
   replacement in charset" piece -- confirm which name is intended. */
bottom = 0; top = charset_n - 1;
while( !found && (bottom <= top) ){
    mid = bottom + (top - bottom) / 2;
    if( value == charset[mid].ch ){
       <.htf into 4hf .>
       found = TRUE;
    } else if( value < charset[mid].ch ){
       top = mid - 1;
    } else {
       bottom = mid + 1;
    }
}
-_-_-

Chapter 16
Unicode into Character Encoding

16.1 Put Character

16.2 Flush Unrecognized Codes

16.3 Load unicode.4hf Table

16.4 Store Entry of Table

16.5 Use Unicode Substitution

16.6 Base 10 for Entity Codes

16.7 UTF-8 for Entity Codes

16.8 Replacements for HTF

16.9 Propagate Changes into the HTF Fonts

Chapter 16
Unicode into Character Encoding

16.1 Put Character

16.2 Flush Unrecognized Codes

16.3 Load unicode.4hf Table

16.4 Store Entry of Table

16.5 Use Unicode Substitution

16.6 Base 10 for Entity Codes

16.7 UTF-8 for Entity Codes

16.8 Replacements for HTF

16.9 Propagate Changes into the HTF Fonts

Chapter 16
Unicode into Character Encoding