2012-03-22

Java HTML (un)escaper

The code is taken from the google-api-translate-java project.


import java.util.Hashtable;

/**
 * Collection of static methods to convert special and extended
 * characters into HTML entities and vice versa.
 * Copyright (c) 2004-2005 Tecnick.com S.r.l (www.tecnick.com) Via Ugo Foscolo
 * n.19 - 09045 Quartu Sant'Elena (CA) - ITALY - www.tecnick.com -
 * info@tecnick.com
 * License: http://www.gnu.org/copyleft/lesser.html LGPL
 *
 * <p>All methods are static and stateless. The lookup maps are filled once in a
 * static initializer and are only read afterwards. Every method that takes a
 * string returns an empty string when given {@code null}.
 *
 * @author Nicola Asuni [www.tecnick.com].
 * @version 1.0.004
 */
public class HTMLEntities {

  /**
   * Translation table for HTML entities: each row is
   * {@code { entity name (String), Unicode code point (Integer) }}.
   * {@code &lt;} and {@code &gt;} are not part of the table, so
   * {@link #htmlentities(String)} leaves angle brackets untouched.
   * reference: W3C - Character entity references in HTML 4 [http://www.w3.org/TR/html401/sgml/entities.html].
   */
  private static final Object[][] HTML_ENTITIES_TABLE = {
    { "&Aacute;", 193 }, { "&aacute;", 225 }, { "&Acirc;", 194 },
    { "&acirc;", 226 }, { "&acute;", 180 }, { "&AElig;", 198 },
    { "&aelig;", 230 }, { "&Agrave;", 192 }, { "&agrave;", 224 },
    { "&alefsym;", 8501 }, { "&Alpha;", 913 }, { "&alpha;", 945 },
    { "&amp;", 38 }, { "&and;", 8743 }, { "&ang;", 8736 },
    { "&Aring;", 197 }, { "&aring;", 229 }, { "&asymp;", 8776 },
    { "&Atilde;", 195 }, { "&atilde;", 227 }, { "&Auml;", 196 },
    { "&auml;", 228 }, { "&bdquo;", 8222 }, { "&Beta;", 914 },
    { "&beta;", 946 }, { "&brvbar;", 166 }, { "&bull;", 8226 },
    { "&cap;", 8745 }, { "&Ccedil;", 199 }, { "&ccedil;", 231 },
    { "&cedil;", 184 }, { "&cent;", 162 }, { "&Chi;", 935 },
    { "&chi;", 967 }, { "&circ;", 710 }, { "&clubs;", 9827 },
    { "&cong;", 8773 }, { "&copy;", 169 }, { "&crarr;", 8629 },
    { "&cup;", 8746 }, { "&curren;", 164 }, { "&dagger;", 8224 },
    { "&Dagger;", 8225 }, { "&darr;", 8595 }, { "&dArr;", 8659 },
    { "&deg;", 176 }, { "&Delta;", 916 }, { "&delta;", 948 },
    { "&diams;", 9830 }, { "&divide;", 247 }, { "&Eacute;", 201 },
    { "&eacute;", 233 }, { "&Ecirc;", 202 }, { "&ecirc;", 234 },
    { "&Egrave;", 200 }, { "&egrave;", 232 }, { "&empty;", 8709 },
    { "&emsp;", 8195 }, { "&ensp;", 8194 }, { "&Epsilon;", 917 },
    { "&epsilon;", 949 }, { "&equiv;", 8801 }, { "&Eta;", 919 },
    { "&eta;", 951 }, { "&ETH;", 208 }, { "&eth;", 240 },
    { "&Euml;", 203 }, { "&euml;", 235 }, { "&euro;", 8364 },
    { "&exist;", 8707 }, { "&fnof;", 402 }, { "&forall;", 8704 },
    { "&frac12;", 189 }, { "&frac14;", 188 }, { "&frac34;", 190 },
    { "&frasl;", 8260 }, { "&Gamma;", 915 }, { "&gamma;", 947 },
    { "&ge;", 8805 }, { "&harr;", 8596 }, { "&hArr;", 8660 },
    { "&hearts;", 9829 }, { "&hellip;", 8230 }, { "&Iacute;", 205 },
    { "&iacute;", 237 }, { "&Icirc;", 206 }, { "&icirc;", 238 },
    { "&iexcl;", 161 }, { "&Igrave;", 204 }, { "&igrave;", 236 },
    { "&image;", 8465 }, { "&infin;", 8734 }, { "&int;", 8747 },
    { "&Iota;", 921 }, { "&iota;", 953 }, { "&iquest;", 191 },
    { "&isin;", 8712 }, { "&Iuml;", 207 }, { "&iuml;", 239 },
    { "&Kappa;", 922 }, { "&kappa;", 954 }, { "&Lambda;", 923 },
    { "&lambda;", 955 }, { "&lang;", 9001 }, { "&laquo;", 171 },
    { "&larr;", 8592 }, { "&lArr;", 8656 }, { "&lceil;", 8968 },
    { "&ldquo;", 8220 }, { "&le;", 8804 }, { "&lfloor;", 8970 },
    { "&lowast;", 8727 }, { "&loz;", 9674 }, { "&lrm;", 8206 },
    { "&lsaquo;", 8249 }, { "&lsquo;", 8216 }, { "&macr;", 175 },
    { "&mdash;", 8212 }, { "&micro;", 181 }, { "&middot;", 183 },
    { "&minus;", 8722 }, { "&Mu;", 924 }, { "&mu;", 956 },
    { "&nabla;", 8711 }, { "&nbsp;", 160 }, { "&ndash;", 8211 },
    { "&ne;", 8800 }, { "&ni;", 8715 }, { "&not;", 172 },
    { "&notin;", 8713 }, { "&nsub;", 8836 }, { "&Ntilde;", 209 },
    { "&ntilde;", 241 }, { "&Nu;", 925 }, { "&nu;", 957 },
    { "&Oacute;", 211 }, { "&oacute;", 243 }, { "&Ocirc;", 212 },
    { "&ocirc;", 244 }, { "&OElig;", 338 }, { "&oelig;", 339 },
    { "&Ograve;", 210 }, { "&ograve;", 242 }, { "&oline;", 8254 },
    { "&Omega;", 937 }, { "&omega;", 969 }, { "&Omicron;", 927 },
    { "&omicron;", 959 }, { "&oplus;", 8853 }, { "&or;", 8744 },
    { "&ordf;", 170 }, { "&ordm;", 186 }, { "&Oslash;", 216 },
    { "&oslash;", 248 }, { "&Otilde;", 213 }, { "&otilde;", 245 },
    { "&otimes;", 8855 }, { "&Ouml;", 214 }, { "&ouml;", 246 },
    { "&para;", 182 }, { "&part;", 8706 }, { "&permil;", 8240 },
    { "&perp;", 8869 }, { "&Phi;", 934 }, { "&phi;", 966 },
    { "&Pi;", 928 }, { "&pi;", 960 }, { "&piv;", 982 },
    { "&plusmn;", 177 }, { "&pound;", 163 }, { "&prime;", 8242 },
    { "&Prime;", 8243 }, { "&prod;", 8719 }, { "&prop;", 8733 },
    { "&Psi;", 936 }, { "&psi;", 968 }, { "&quot;", 34 },
    { "&radic;", 8730 }, { "&rang;", 9002 }, { "&raquo;", 187 },
    { "&rarr;", 8594 }, { "&rArr;", 8658 }, { "&rceil;", 8969 },
    { "&rdquo;", 8221 }, { "&real;", 8476 }, { "&reg;", 174 },
    { "&rfloor;", 8971 }, { "&Rho;", 929 }, { "&rho;", 961 },
    { "&rlm;", 8207 }, { "&rsaquo;", 8250 }, { "&rsquo;", 8217 },
    { "&sbquo;", 8218 }, { "&Scaron;", 352 }, { "&scaron;", 353 },
    { "&sdot;", 8901 }, { "&sect;", 167 }, { "&shy;", 173 },
    { "&Sigma;", 931 }, { "&sigma;", 963 }, { "&sigmaf;", 962 },
    { "&sim;", 8764 }, { "&spades;", 9824 }, { "&sub;", 8834 },
    { "&sube;", 8838 }, { "&sum;", 8721 }, { "&sup1;", 185 },
    { "&sup2;", 178 }, { "&sup3;", 179 }, { "&sup;", 8835 },
    { "&supe;", 8839 }, { "&szlig;", 223 }, { "&Tau;", 932 },
    { "&tau;", 964 }, { "&there4;", 8756 }, { "&Theta;", 920 },
    { "&theta;", 952 }, { "&thetasym;", 977 }, { "&thinsp;", 8201 },
    { "&THORN;", 222 }, { "&thorn;", 254 }, { "&tilde;", 732 },
    { "&times;", 215 }, { "&trade;", 8482 }, { "&Uacute;", 218 },
    { "&uacute;", 250 }, { "&uarr;", 8593 }, { "&uArr;", 8657 },
    { "&Ucirc;", 219 }, { "&ucirc;", 251 }, { "&Ugrave;", 217 },
    { "&ugrave;", 249 }, { "&uml;", 168 }, { "&upsih;", 978 },
    { "&Upsilon;", 933 }, { "&upsilon;", 965 }, { "&Uuml;", 220 },
    { "&uuml;", 252 }, { "&weierp;", 8472 }, { "&Xi;", 926 },
    { "&xi;", 958 }, { "&Yacute;", 221 }, { "&yacute;", 253 },
    { "&yen;", 165 }, { "&yuml;", 255 }, { "&Yuml;", 376 },
    { "&Zeta;", 918 }, { "&zeta;", 950 }, { "&zwj;", 8205 },
    { "&zwnj;", 8204 } };

  /**
   * Longest entity, counted from '&amp;' to ';' inclusive, that the decoder
   * will consider. Ten characters covers the longest named entity in the
   * table ({@code &thetasym;}) and the largest numeric references
   * ({@code &#1114111;} and {@code &#x10FFFF;}).
   */
  private static final int MAX_ENTITY_LENGTH = 10;

  /**
   * Map to convert extended characters (by code point) into html entities.
   */
  private static final Hashtable<Integer, String> CODE_TO_ENTITY =
      new Hashtable<Integer, String>();

  /**
   * Map to convert html entities into extended characters (code points).
   */
  private static final Hashtable<String, Integer> ENTITY_TO_CODE =
      new Hashtable<String, Integer>();

  // Fill both maps once, at class-load time, so callers never observe a
  // partially populated map.
  static {
    for (int i = 0; i < HTML_ENTITIES_TABLE.length; i++) {
      String entity = (String) HTML_ENTITIES_TABLE[i][0];
      Integer codePoint = (Integer) HTML_ENTITIES_TABLE[i][1];
      CODE_TO_ENTITY.put(codePoint, entity);
      ENTITY_TO_CODE.put(entity, codePoint);
    }
  }

  //==============================================================================
  // METHODS
  //==============================================================================

  /**
   * Kept for backward compatibility with code that instantiates this class.
   * The translation maps are already initialized statically, so the
   * constructor does nothing.
   */
  public HTMLEntities() {
  }

  /**
   * Get the html entities translation table.
   *
   * @return a copy of the translation table; each row is
   *         {@code { String entity, Integer codePoint }}. Changing the
   *         returned array does not affect this class.
   */
  public static Object[][] getEntitiesTable() {
    Object[][] copy = new Object[HTML_ENTITIES_TABLE.length][];
    for (int i = 0; i < HTML_ENTITIES_TABLE.length; i++) {
      copy[i] = HTML_ENTITIES_TABLE[i].clone();
    }
    return copy;
  }

  /**
   * Convert special and extended characters into HTML entities.
   * Characters found in the translation table become named entities; any
   * other non-ASCII character becomes a decimal numeric reference. The input
   * is processed by code point, so a surrogate pair yields a single
   * reference.
   *
   * @param str input string, may be {@code null}
   * @return formatted string, or an empty string if {@code str} is {@code null}
   * @see #unhtmlentities(String)
   */
  public static String htmlentities(String str) {
    if (str == null) {
      return "";
    }
    StringBuilder buf = new StringBuilder(str.length() + 16);
    int i = 0;
    while (i < str.length()) {
      int codePoint = str.codePointAt(i);
      i += Character.charCount(codePoint);
      String entity = CODE_TO_ENTITY.get(Integer.valueOf(codePoint));
      if (entity != null) {
        buf.append(entity);
      } else if (codePoint > 127) {
        // no named entity: fall back to a numeric character reference
        buf.append("&#").append(codePoint).append(';');
      } else {
        buf.append((char) codePoint);
      }
    }
    return buf.toString();
  }

  /**
   * Convert HTML entities to special and extended unicode characters
   * equivalents. Named entities from the translation table and decimal or
   * hexadecimal numeric references are decoded; anything that is not a
   * recognizable entity is copied to the output unchanged.
   *
   * @param str input string, may be {@code null}
   * @return formatted string, or an empty string if {@code str} is {@code null}
   * @see #htmlentities(String)
   */
  public static String unhtmlentities(String str) {
    if (str == null) {
      return "";
    }
    StringBuilder buf = new StringBuilder(str.length());
    int i = 0;
    while (i < str.length()) {
      char ch = str.charAt(i);
      if (ch == '&') {
        int semi = findEntityEnd(str, i);
        if (semi != -1) {
          int codePoint = decodeEntity(str.substring(i, semi + 1));
          if (codePoint >= 0) {
            buf.appendCodePoint(codePoint);
            i = semi + 1;
            continue;
          }
        }
        // Not an entity: emit only the '&' and keep scanning, so that a
        // real entity further along (e.g. "&x &amp;") is still decoded.
      }
      buf.append(ch);
      i++;
    }
    return buf.toString();
  }

  /**
   * Find the ';' that closes an entity starting at {@code start}, looking no
   * further than {@link #MAX_ENTITY_LENGTH} characters.
   *
   * @param str the string being decoded
   * @param start index of the '&amp;' character
   * @return index of the closing ';', or -1 if there is none within range
   */
  private static int findEntityEnd(String str, int start) {
    int last = Math.min(str.length(), start + MAX_ENTITY_LENGTH) - 1;
    for (int j = start + 1; j <= last; j++) {
      if (str.charAt(j) == ';') {
        return j;
      }
    }
    return -1;
  }

  /**
   * Decode a single entity of the form {@code &name;}, {@code &#NNN;} or
   * {@code &#xHHH;}.
   *
   * @param entity the entity text, including the leading '&amp;' and trailing ';'
   * @return the code point, or -1 if the entity is unknown or malformed
   */
  private static int decodeEntity(String entity) {
    if (entity.length() > 2 && entity.charAt(1) == '#') {
      return decodeNumericEntity(entity);
    }
    Integer codePoint = ENTITY_TO_CODE.get(entity);
    return (codePoint == null) ? -1 : codePoint.intValue();
  }

  /**
   * Decode a numeric character reference ({@code &#NNN;} or {@code &#xHHH;},
   * the 'x' may be upper or lower case).
   *
   * @param entity the entity text, starting with {@code &#} and ending with ';'
   * @return the code point, or -1 if the digits are missing, malformed or
   *         outside the Unicode range
   */
  private static int decodeNumericEntity(String entity) {
    boolean hex = entity.length() > 3
        && (entity.charAt(2) == 'x' || entity.charAt(2) == 'X');
    int radix = hex ? 16 : 10;
    String digits = entity.substring(hex ? 3 : 2, entity.length() - 1);
    // parseInt would accept a leading sign; a character reference must not have one
    if (digits.isEmpty() || Character.digit(digits.charAt(0), radix) < 0) {
      return -1;
    }
    try {
      int codePoint = Integer.parseInt(digits, radix);
      return Character.isValidCodePoint(codePoint) ? codePoint : -1;
    } catch (NumberFormatException e) {
      return -1;
    }
  }

  /**
   * Map {@code null} to the empty string so the helpers below agree with
   * {@link #htmlentities(String)} on null input.
   */
  private static String nullToEmpty(String str) {
    return (str == null) ? "" : str;
  }

  // methods to convert special characters

  /**
   * Replace single quotes characters with HTML entities.
   * The apostrophe, its numeric references ({@code &#039;}, {@code &#39;})
   * and the legacy Windows-1252 quote references ({@code &#145;},
   * {@code &#146;}) all become {@code &rsquo;}.
   *
   * @param str the input string, may be {@code null}
   * @return string with replaced single quotes (empty if {@code str} is {@code null})
   */
  public static String htmlSingleQuotes(String str) {
    String result = nullToEmpty(str);
    result = result.replace("'", "&rsquo;");
    result = result.replace("&#039;", "&rsquo;");
    result = result.replace("&#39;", "&rsquo;");
    result = result.replace("&#145;", "&rsquo;");
    result = result.replace("&#146;", "&rsquo;");
    return result;
  }

  /**
   * Replace single quotes HTML entities with equivalent character.
   *
   * @param str the input string, may be {@code null}
   * @return string with {@code &rsquo;} replaced by an apostrophe (empty if
   *         {@code str} is {@code null})
   */
  public static String unhtmlSingleQuotes(String str) {
    return nullToEmpty(str).replace("&rsquo;", "'");
  }

  /**
   * Replace double quotes characters with HTML entities.
   * The double quote character and the legacy Windows-1252 quote references
   * ({@code &#147;}, {@code &#148;}) all become {@code &quot;}.
   *
   * @param str the input string, may be {@code null}
   * @return string with replaced double quotes (empty if {@code str} is {@code null})
   */
  public static String htmlDoubleQuotes(String str) {
    String result = nullToEmpty(str);
    result = result.replace("\"", "&quot;");
    result = result.replace("&#147;", "&quot;");
    result = result.replace("&#148;", "&quot;");
    return result;
  }

  /**
   * Replace double quotes HTML entities with equivalent character.
   *
   * @param str the input string, may be {@code null}
   * @return string with {@code &quot;} replaced by a double quote (empty if
   *         {@code str} is {@code null})
   */
  public static String unhtmlDoubleQuotes(String str) {
    return nullToEmpty(str).replace("&quot;", "\"");
  }

  /**
   * Replace single and double quotes characters with HTML entities.
   *
   * @param str the input string, may be {@code null}
   * @return string with replaced quotes (empty if {@code str} is {@code null})
   */
  public static String htmlQuotes(String str) {
    return htmlSingleQuotes(htmlDoubleQuotes(str));
  }

  /**
   * Replace single and double quotes HTML entities with equivalent characters.
   *
   * @param str the input string, may be {@code null}
   * @return string with replaced quotes (empty if {@code str} is {@code null})
   */
  public static String unhtmlQuotes(String str) {
    return unhtmlSingleQuotes(unhtmlDoubleQuotes(str));
  }

  /**
   * Replace angle bracket characters with the {@code &lt;} and {@code &gt;}
   * entities.
   *
   * @param str the input string, may be {@code null}
   * @return string with replaced characters (empty if {@code str} is {@code null})
   */
  public static String htmlAngleBrackets(String str) {
    return nullToEmpty(str).replace("<", "&lt;").replace(">", "&gt;");
  }

  /**
   * Replace the {@code &lt;} and {@code &gt;} entities with angle bracket
   * characters.
   *
   * @param str the input string, may be {@code null}
   * @return string with replaced entities (empty if {@code str} is {@code null})
   */
  public static String unhtmlAngleBrackets(String str) {
    return nullToEmpty(str).replace("&lt;", "<").replace("&gt;", ">");
  }

  /**
   * Replace ampersand characters with {@code &amp;} HTML entities.
   * Every ampersand is replaced, including those that already start an entity.
   *
   * @param str the input string, may be {@code null}
   * @return string with replaced characters (empty if {@code str} is {@code null})
   */
  public static String htmlAmpersand(String str) {
    return nullToEmpty(str).replace("&", "&amp;");
  }

  /**
   * Replace {@code &amp;} HTML entities with ampersand characters.
   *
   * @param str the input string, may be {@code null}
   * @return string with replaced entities (empty if {@code str} is {@code null})
   */
  public static String unhtmlAmpersand(String str) {
    return nullToEmpty(str).replace("&amp;", "&");
  }
}

No comments: