diplodocus.org Git - nmh/blob - sbr/check_charset.c

   1 /* check_charset.c -- routines for character sets
   2  *
   3  * This code is Copyright (c) 2002, by the authors of nmh.  See the
   4  * COPYRIGHT file in the root directory of the nmh distribution for
   5  * complete copyright information.
   6  */
   7
   8 #include <h/mh.h>
   9
  10 #include <string.h>
  11 #include <langinfo.h>
  12
  13 static const char *norm_charmap(char *name);
  14
  15 /*
  16  * Get the current character set
  17  */
  18 char *
  19 get_charset(void)
  20 {
  21     return (char *)norm_charmap(nl_langinfo(CODESET));
  22 }
  23
  24
  25 /*
  26  * Check if we can display a given character set natively.
  27  * We are passed the length of the initial part of the
  28  * string to check, since we want to allow the name of the
  29  * character set to be a substring of a larger string.
  30  */
  31
  32 int
  33 check_charset (char *str, int len)
  34 {
  35     static char *mm_charset = NULL;
  36     static char *alt_charset = NULL;
  37     static int mm_len;
  38     static int alt_len;
  39
  40     /* Cache the name of our default character set */
  41     if (!mm_charset) {
  42         if (!(mm_charset = get_charset ()))
  43             mm_charset = "US-ASCII";
  44         mm_len = strlen (mm_charset);
  45
  46         /* US-ASCII is a subset of the ISO-8859-X and UTF-8 character sets */
  47         if (!strncasecmp("ISO-8859-", mm_charset, 9) ||
  48                 !strcasecmp("UTF-8", mm_charset)) {
  49             alt_charset = "US-ASCII";
  50             alt_len = strlen (alt_charset);
  51         }
  52     }
  53
  54     /* Check if character set is OK */
  55     if ((len == mm_len) && !strncasecmp(str, mm_charset, mm_len))
  56         return 1;
  57     if (alt_charset && (len == alt_len) && !strncasecmp(str, alt_charset, alt_len))
  58         return 1;
  59
  60     return 0;
  61 }
  62
  63
  64 /*
  65  * Return the name of the character set we are
  66  * using for 8bit text.
  67  */
  68 char *
  69 write_charset_8bit (void)
  70 {
  71     static char *mm_charset = NULL;
  72
  73     /*
  74      * Cache the name of the character set to
  75      * use for 8bit text.
  76      */
  77     if (!mm_charset && !(mm_charset = get_charset ()))
  78             mm_charset = "x-unknown";
  79
  80     return mm_charset;
  81 }
  82
  83 /* The Single Unix Specification function nl_langinfo(CODESET)
  84  * returns the name of the encoding used by the currently selected
  85  * locale:
  86  *
  87  *   http://www.opengroup.org/onlinepubs/7908799/xsh/langinfo.h.html
  88  *
  89  * Unfortunately the encoding names are not yet standardized.
  90  * This function knows about the encoding names used on many
  91  * different systems and converts them where possible into
  92  * the corresponding MIME charset name registered in
  93  *
  94  *   http://www.iana.org/assignments/character-sets
  95  *
  96  * Please extend it as needed and suggest improvements to the author.
  97  *
  98  * Markus.Kuhn@cl.cam.ac.uk -- 2002-03-11
  99  * Permission to use, copy, modify, and distribute this software
 100  * for any purpose and without fee is hereby granted. The author
 101  * disclaims all warranties with regard to this software.
 102  *
 103  * Latest version:
 104  *
 105  *   http://www.cl.cam.ac.uk/~mgk25/ucs/norm_charmap.c
 106  */
 107
 108 static const char *norm_charmap(char *name)
 109 {
 110     static const char *correct[] = {
 111         "UTF-8",
 112         "US-ASCII",
 113         NULL
 114     }, **cor;
 115     static struct {
 116         const char *alias;
 117         const char *name;
 118     } *ali, aliases[] = {
 119         /* Names for US-ASCII. */
 120         { "ANSI_X3.4-1968", "US-ASCII" }, /* LC_ALL=C. */
 121         { "ASCII", "US-ASCII" },
 122         { "646", "US-ASCII" },
 123         { "ISO646", "US-ASCII" },
 124         { "ISO_646.IRV", "US-ASCII" },
 125         /* Case differs. */
 126         { "BIG5", "Big5" },
 127         { "BIG5HKSCS", "Big5HKSCS" },
 128         /* Names for ISO-8859-11. */
 129         { "TIS-620", "ISO-8859-11" },
 130         { "TIS620.2533", "ISO-8859-11" },
 131         { NULL, NULL }
 132     };
 133     static struct {
 134         const char *substr;
 135         const char *name;
 136     } *sub, substrs[] = {
 137         { "8859-1", "ISO-8859-1" },
 138         { "8859-2", "ISO-8859-2" },
 139         { "8859-3", "ISO-8859-3" },
 140         { "8859-4", "ISO-8859-4" },
 141         { "8859-5", "ISO-8859-5" },
 142         { "8859-6", "ISO-8859-6" },
 143         { "8859-7", "ISO-8859-7" },
 144         { "8859-8", "ISO-8859-8" },
 145         { "8859-9", "ISO-8859-9" },
 146         { "8859-10", "ISO-8859-10" },
 147         { "8859-11", "ISO-8859-11" },
 148         /* 12, Latin/Devanagari, not completed. */
 149         { "8859-13", "ISO-8859-13" },
 150         { "8859-14", "ISO-8859-14" },
 151         { "8859-15", "ISO-8859-15" },
 152         { "8859-16", "ISO-8859-16" },
 153         { "CP1200", "WINDOWS-1200" },
 154         { "CP1201", "WINDOWS-1201" },
 155         { "CP1250", "WINDOWS-1250" },
 156         { "CP1251", "WINDOWS-1251" },
 157         { "CP1252", "WINDOWS-1252" },
 158         { "CP1253", "WINDOWS-1253" },
 159         { "CP1254", "WINDOWS-1254" },
 160         { "CP1255", "WINDOWS-1255" },
 161         { "CP1256", "WINDOWS-1256" },
 162         { "CP1257", "WINDOWS-1257" },
 163         { "CP1258", "WINDOWS-1258" },
 164         { NULL, NULL }
 165     };
 166
 167     if (!name)
 168         return name;
 169
 170     /* Avoid lots of tests for common correct names. */
 171     for (cor = correct; *cor; cor++)
 172         if (!strcmp(name, *cor))
 173             return name;
 174
 175     for (ali = aliases; ali->alias; ali++)
 176         if (!strcmp(name, ali->alias))
 177             return ali->name;
 178
 179     for (sub = substrs; sub->substr; sub++)
 180         if (strstr(name, sub->substr))
 181             return sub->name;
 182
 183     return name;
 184 }