package weibo4j.http;
import java.util.HashMap; import java.util.Map;
/**
 * Converts between literal characters and HTML character entity references.
 *
 * <p>{@link #escape(String)} replaces every character that has a named entity in the
 * table below with that entity (for example {@code <} becomes {@code &lt;}).
 * {@link #unescape(String)} performs the reverse mapping and accepts both the named
 * form and the decimal numeric form of each entity in the table. Entities that are
 * not in the table are left untouched.
 *
 * <p>The lookup maps are populated once in the static initializer and only read
 * afterwards.
 */
public class HTMLEntity {

    /**
     * Longest entity reference, counted from the ampersand to the semicolon inclusive,
     * that {@code unescape} will try to look up. The longest table entries
     * (such as {@code &thetasym;}) are exactly this long.
     */
    private static final int MAX_ENTITY_LENGTH = 10;

    /** Maps a single literal character to its named entity, e.g. {@code "<"} to {@code "&lt;"}. */
    private static final Map<String, String> entityEscapeMap = new HashMap<String, String>();

    /** Maps a named or decimal numeric entity to its literal character. */
    private static final Map<String, String> escapeEntityMap = new HashMap<String, String>();

    static {
        // Each row is { named entity, literal character }. The decimal numeric
        // reference is derived from the character's code point in the loop below,
        // so the two spellings can never drift apart.
        String[][] entities = {
            // ISO 8859-1 (Latin-1) characters
            {"&nbsp;", "\u00A0"}, {"&iexcl;", "\u00A1"}, {"&cent;", "\u00A2"}, {"&pound;", "\u00A3"},
            {"&curren;", "\u00A4"}, {"&yen;", "\u00A5"}, {"&brvbar;", "\u00A6"}, {"&sect;", "\u00A7"},
            {"&uml;", "\u00A8"}, {"&copy;", "\u00A9"}, {"&ordf;", "\u00AA"}, {"&laquo;", "\u00AB"},
            {"&not;", "\u00AC"}, {"&shy;", "\u00AD"}, {"&reg;", "\u00AE"}, {"&macr;", "\u00AF"},
            {"&deg;", "\u00B0"}, {"&plusmn;", "\u00B1"}, {"&sup2;", "\u00B2"}, {"&sup3;", "\u00B3"},
            {"&acute;", "\u00B4"}, {"&micro;", "\u00B5"}, {"&para;", "\u00B6"}, {"&middot;", "\u00B7"},
            {"&cedil;", "\u00B8"}, {"&sup1;", "\u00B9"}, {"&ordm;", "\u00BA"}, {"&raquo;", "\u00BB"},
            {"&frac14;", "\u00BC"}, {"&frac12;", "\u00BD"}, {"&frac34;", "\u00BE"}, {"&iquest;", "\u00BF"},
            {"&Agrave;", "\u00C0"}, {"&Aacute;", "\u00C1"}, {"&Acirc;", "\u00C2"}, {"&Atilde;", "\u00C3"},
            {"&Auml;", "\u00C4"}, {"&Aring;", "\u00C5"}, {"&AElig;", "\u00C6"}, {"&Ccedil;", "\u00C7"},
            {"&Egrave;", "\u00C8"}, {"&Eacute;", "\u00C9"}, {"&Ecirc;", "\u00CA"}, {"&Euml;", "\u00CB"},
            {"&Igrave;", "\u00CC"}, {"&Iacute;", "\u00CD"}, {"&Icirc;", "\u00CE"}, {"&Iuml;", "\u00CF"},
            {"&ETH;", "\u00D0"}, {"&Ntilde;", "\u00D1"}, {"&Ograve;", "\u00D2"}, {"&Oacute;", "\u00D3"},
            {"&Ocirc;", "\u00D4"}, {"&Otilde;", "\u00D5"}, {"&Ouml;", "\u00D6"}, {"&times;", "\u00D7"},
            {"&Oslash;", "\u00D8"}, {"&Ugrave;", "\u00D9"}, {"&Uacute;", "\u00DA"}, {"&Ucirc;", "\u00DB"},
            {"&Uuml;", "\u00DC"}, {"&Yacute;", "\u00DD"}, {"&THORN;", "\u00DE"}, {"&szlig;", "\u00DF"},
            {"&agrave;", "\u00E0"}, {"&aacute;", "\u00E1"}, {"&acirc;", "\u00E2"}, {"&atilde;", "\u00E3"},
            {"&auml;", "\u00E4"}, {"&aring;", "\u00E5"}, {"&aelig;", "\u00E6"}, {"&ccedil;", "\u00E7"},
            {"&egrave;", "\u00E8"}, {"&eacute;", "\u00E9"}, {"&ecirc;", "\u00EA"}, {"&euml;", "\u00EB"},
            {"&igrave;", "\u00EC"}, {"&iacute;", "\u00ED"}, {"&icirc;", "\u00EE"}, {"&iuml;", "\u00EF"},
            {"&eth;", "\u00F0"}, {"&ntilde;", "\u00F1"}, {"&ograve;", "\u00F2"}, {"&oacute;", "\u00F3"},
            {"&ocirc;", "\u00F4"}, {"&otilde;", "\u00F5"}, {"&ouml;", "\u00F6"}, {"&divide;", "\u00F7"},
            {"&oslash;", "\u00F8"}, {"&ugrave;", "\u00F9"}, {"&uacute;", "\u00FA"}, {"&ucirc;", "\u00FB"},
            {"&uuml;", "\u00FC"}, {"&yacute;", "\u00FD"}, {"&thorn;", "\u00FE"}, {"&yuml;", "\u00FF"},
            // Latin Extended-B
            {"&fnof;", "\u0192"},
            // Greek
            {"&Alpha;", "\u0391"}, {"&Beta;", "\u0392"}, {"&Gamma;", "\u0393"}, {"&Delta;", "\u0394"},
            {"&Epsilon;", "\u0395"}, {"&Zeta;", "\u0396"}, {"&Eta;", "\u0397"}, {"&Theta;", "\u0398"},
            {"&Iota;", "\u0399"}, {"&Kappa;", "\u039A"}, {"&Lambda;", "\u039B"}, {"&Mu;", "\u039C"},
            {"&Nu;", "\u039D"}, {"&Xi;", "\u039E"}, {"&Omicron;", "\u039F"}, {"&Pi;", "\u03A0"},
            {"&Rho;", "\u03A1"}, {"&Sigma;", "\u03A3"}, {"&Tau;", "\u03A4"}, {"&Upsilon;", "\u03A5"},
            {"&Phi;", "\u03A6"}, {"&Chi;", "\u03A7"}, {"&Psi;", "\u03A8"}, {"&Omega;", "\u03A9"},
            {"&alpha;", "\u03B1"}, {"&beta;", "\u03B2"}, {"&gamma;", "\u03B3"}, {"&delta;", "\u03B4"},
            {"&epsilon;", "\u03B5"}, {"&zeta;", "\u03B6"}, {"&eta;", "\u03B7"}, {"&theta;", "\u03B8"},
            {"&iota;", "\u03B9"}, {"&kappa;", "\u03BA"}, {"&lambda;", "\u03BB"}, {"&mu;", "\u03BC"},
            {"&nu;", "\u03BD"}, {"&xi;", "\u03BE"}, {"&omicron;", "\u03BF"}, {"&pi;", "\u03C0"},
            {"&rho;", "\u03C1"}, {"&sigmaf;", "\u03C2"}, {"&sigma;", "\u03C3"}, {"&tau;", "\u03C4"},
            {"&upsilon;", "\u03C5"}, {"&phi;", "\u03C6"}, {"&chi;", "\u03C7"}, {"&psi;", "\u03C8"},
            {"&omega;", "\u03C9"}, {"&thetasym;", "\u03D1"}, {"&upsih;", "\u03D2"}, {"&piv;", "\u03D6"},
            // General punctuation
            {"&bull;", "\u2022"}, {"&hellip;", "\u2026"}, {"&prime;", "\u2032"}, {"&Prime;", "\u2033"},
            {"&oline;", "\u203E"}, {"&frasl;", "\u2044"},
            // Letterlike symbols
            {"&weierp;", "\u2118"}, {"&image;", "\u2111"}, {"&real;", "\u211C"}, {"&trade;", "\u2122"},
            {"&alefsym;", "\u2135"},
            // Arrows. Unicode does not state that lArr / rArr are the "is implied by" /
            // "implies" arrows, but it has no other characters for those functions, so
            // they can be used that way, as ISOtech suggests.
            {"&larr;", "\u2190"}, {"&uarr;", "\u2191"}, {"&rarr;", "\u2192"}, {"&darr;", "\u2193"},
            {"&harr;", "\u2194"}, {"&crarr;", "\u21B5"}, {"&lArr;", "\u21D0"}, {"&uArr;", "\u21D1"},
            {"&rArr;", "\u21D2"}, {"&dArr;", "\u21D3"}, {"&hArr;", "\u21D4"},
            // Mathematical operators. The tilde operator (sim) is NOT the same character
            // as the ASCII tilde U+007E, and the dot operator (sdot) is NOT the same
            // character as the middle dot U+00B7.
            {"&forall;", "\u2200"}, {"&part;", "\u2202"}, {"&exist;", "\u2203"}, {"&empty;", "\u2205"},
            {"&nabla;", "\u2207"}, {"&isin;", "\u2208"}, {"&notin;", "\u2209"}, {"&ni;", "\u220B"},
            {"&prod;", "\u220F"}, {"&sum;", "\u2211"}, {"&minus;", "\u2212"}, {"&lowast;", "\u2217"},
            {"&radic;", "\u221A"}, {"&prop;", "\u221D"}, {"&infin;", "\u221E"}, {"&ang;", "\u2220"},
            {"&and;", "\u2227"}, {"&or;", "\u2228"}, {"&cap;", "\u2229"}, {"&cup;", "\u222A"},
            {"&int;", "\u222B"}, {"&there4;", "\u2234"}, {"&sim;", "\u223C"}, {"&cong;", "\u2245"},
            {"&asymp;", "\u2248"}, {"&ne;", "\u2260"}, {"&equiv;", "\u2261"}, {"&le;", "\u2264"},
            {"&ge;", "\u2265"}, {"&sub;", "\u2282"}, {"&sup;", "\u2283"}, {"&sube;", "\u2286"},
            {"&supe;", "\u2287"}, {"&oplus;", "\u2295"}, {"&otimes;", "\u2297"}, {"&perp;", "\u22A5"},
            {"&sdot;", "\u22C5"},
            // Miscellaneous technical
            {"&lceil;", "\u2308"}, {"&rceil;", "\u2309"}, {"&lfloor;", "\u230A"}, {"&rfloor;", "\u230B"},
            {"&lang;", "\u2329"}, {"&rang;", "\u232A"},
            // Geometric shapes and miscellaneous symbols
            {"&loz;", "\u25CA"}, {"&spades;", "\u2660"}, {"&clubs;", "\u2663"}, {"&hearts;", "\u2665"},
            {"&diams;", "\u2666"},
            // Markup-significant ASCII characters
            {"&quot;", "\""}, {"&amp;", "\u0026"}, {"&lt;", "\u003C"}, {"&gt;", "\u003E"},
            // Latin Extended-A. "Ligature" is a misnomer for OElig / oelig: this is a
            // separate character in some languages.
            {"&OElig;", "\u0152"}, {"&oelig;", "\u0153"}, {"&Scaron;", "\u0160"}, {"&scaron;", "\u0161"},
            {"&Yuml;", "\u0178"},
            // Spacing modifier letters
            {"&circ;", "\u02C6"}, {"&tilde;", "\u02DC"},
            // General punctuation (spaces, joiners, dashes and quotation marks)
            {"&ensp;", "\u2002"}, {"&emsp;", "\u2003"}, {"&thinsp;", "\u2009"}, {"&zwnj;", "\u200C"},
            {"&zwj;", "\u200D"}, {"&lrm;", "\u200E"}, {"&rlm;", "\u200F"}, {"&ndash;", "\u2013"},
            {"&mdash;", "\u2014"}, {"&lsquo;", "\u2018"}, {"&rsquo;", "\u2019"}, {"&sbquo;", "\u201A"},
            {"&ldquo;", "\u201C"}, {"&rdquo;", "\u201D"}, {"&bdquo;", "\u201E"}, {"&dagger;", "\u2020"},
            {"&Dagger;", "\u2021"}, {"&permil;", "\u2030"}, {"&lsaquo;", "\u2039"}, {"&rsaquo;", "\u203A"},
            {"&euro;", "\u20AC"}
        };
        for (String[] entity : entities) {
            String named = entity[0];
            String character = entity[1];
            // Every table character is a single UTF-16 unit, so charAt(0) is its code point.
            String numeric = "&#" + (int) character.charAt(0) + ";";
            entityEscapeMap.put(character, named);
            escapeEntityMap.put(named, character);
            escapeEntityMap.put(numeric, character);
        }
    }

    /**
     * Returns a copy of {@code original} in which every character that has a named
     * entity in the table is replaced by that entity.
     *
     * @param original the text to escape; must not be {@code null}
     * @return the escaped text
     * @throws NullPointerException if {@code original} is {@code null}
     */
    public static String escape(String original) {
        StringBuffer buf = new StringBuffer(original);
        escape(buf);
        return buf.toString();
    }

    /**
     * Escapes the contents of {@code original} in place, replacing every character
     * that has a named entity in the table by that entity.
     *
     * @param original the buffer to modify; must not be {@code null}
     */
    public static void escape(StringBuffer original) {
        int index = 0;
        while (index < original.length()) {
            String escaped = entityEscapeMap.get(original.substring(index, index + 1));
            if (escaped != null) {
                original.replace(index, index + 1, escaped);
                // Jump over the inserted entity so its own '&' is not escaped again.
                index += escaped.length();
            } else {
                index++;
            }
        }
    }

    /**
     * Returns a copy of {@code original} in which every known named or decimal
     * numeric entity is replaced by the character it stands for.
     *
     * @param original the text to unescape; must not be {@code null}
     * @return the unescaped text
     * @throws NullPointerException if {@code original} is {@code null}
     */
    public static String unescape(String original) {
        StringBuffer buf = new StringBuffer(original);
        unescape(buf);
        return buf.toString();
    }

    /**
     * Unescapes the contents of {@code original} in place. Each known entity is
     * replaced by its character; unknown entities and bare ampersands are kept as-is.
     * Decoding is single-pass: the result of a replacement is never decoded again,
     * so {@code &amp;lt;} becomes {@code &lt;}, not {@code <}.
     *
     * @param original the buffer to modify; must not be {@code null}
     */
    public static void unescape(StringBuffer original) {
        int index = 0;
        while (index < original.length()) {
            index = original.indexOf("&", index);
            if (index == -1) {
                break;
            }
            int semicolonIndex = original.indexOf(";", index);
            if (semicolonIndex == -1) {
                // No terminator anywhere ahead, so no further entity can exist.
                break;
            }
            // A distant semicolon only rules out THIS ampersand; keep scanning so that
            // entities later in the text are still decoded.
            if (semicolonIndex - index < MAX_ENTITY_LENGTH) {
                String entity = escapeEntityMap.get(original.substring(index, semicolonIndex + 1));
                if (entity != null) {
                    original.replace(index, semicolonIndex + 1, entity);
                }
            }
            // Step past the ampersand (or the single character that replaced it).
            index++;
        }
    }
}