class HtmlCodeCleaner

HtmlCodeCleaner

This class holds a single dictionary that maps HTML character codes to their corresponding symbols. It takes a string as input and replaces every HTML code found in the dictionary with the corresponding symbol. The output is a string.

Public Class Methods

clean_string(string) click to toggle source

Replaces the HTML character codes found in the string with their symbols; only codes defined in the dictionary are replaced.

# File lib/HtmlCodeCleaner.rb, line 10
# Dictionary mapping each supported HTML character reference (numeric
# "&#NNN;" or named "&name;") to its replacement text. Built once and frozen
# so it is not re-allocated on every call.
#
# NOTE(review): "&#147;" maps to a straight double quote while "&#8220;" maps
# to a curly one; this is kept exactly as found - confirm it is intentional.
# NOTE(review): "&#160;"/"&nbsp;" and the undefined codes 129/141/143/144/157
# appear to map to a plain space, and "&#173;"/"&shy;" to an empty string -
# confirm against the original file.
HTML_CODE_TABLE = {
  # Printable ASCII
  '&#32;' => ' ', '&#33;' => '!',
  '&#34;' => '"', '&quot;' => '"',
  '&#35;' => '#', '&#36;' => '$', '&#37;' => '%',
  '&#38;' => '&', '&amp;' => '&',
  '&#39;' => "'", '&#40;' => '(', '&#41;' => ')', '&#42;' => '*',
  '&#43;' => '+', '&#44;' => ',', '&#45;' => '-', '&#46;' => '.',
  '&#47;' => '/',
  '&#48;' => '0', '&#49;' => '1', '&#50;' => '2', '&#51;' => '3',
  '&#52;' => '4', '&#53;' => '5', '&#54;' => '6', '&#55;' => '7',
  '&#56;' => '8', '&#57;' => '9',
  '&#58;' => ':', '&#59;' => ';',
  '&#60;' => '<', '&lt;' => '<',
  '&#61;' => '=',
  '&#62;' => '>', '&gt;' => '>',
  '&#63;' => '?', '&#64;' => '@',
  '&#65;' => 'A', '&#97;' => 'a',
  '&#66;' => 'B', '&#98;' => 'b',
  '&#67;' => 'C', '&#99;' => 'c',
  '&#68;' => 'D', '&#100;' => 'd',
  '&#69;' => 'E', '&#101;' => 'e',
  '&#70;' => 'F', '&#102;' => 'f',
  '&#71;' => 'G', '&#103;' => 'g',
  '&#72;' => 'H', '&#104;' => 'h',
  '&#73;' => 'I', '&#105;' => 'i',
  '&#74;' => 'J', '&#106;' => 'j',
  '&#75;' => 'K', '&#107;' => 'k',
  '&#76;' => 'L', '&#108;' => 'l',
  '&#77;' => 'M', '&#109;' => 'm',
  '&#78;' => 'N', '&#110;' => 'n',
  '&#79;' => 'O', '&#111;' => 'o',
  '&#80;' => 'P', '&#112;' => 'p',
  '&#81;' => 'Q', '&#113;' => 'q',
  '&#82;' => 'R', '&#114;' => 'r',
  '&#83;' => 'S', '&#115;' => 's',
  '&#84;' => 'T', '&#116;' => 't',
  '&#85;' => 'U', '&#117;' => 'u',
  '&#86;' => 'V', '&#118;' => 'v',
  '&#87;' => 'W', '&#119;' => 'w',
  '&#88;' => 'X', '&#120;' => 'x',
  '&#89;' => 'Y', '&#121;' => 'y',
  '&#90;' => 'Z', '&#122;' => 'z',
  '&#91;' => '[',
  '&#92;' => '\\', # a single backslash
  '&#93;' => ']', '&#94;' => '^', '&#95;' => '_', '&#96;' => '`',
  '&#123;' => '{', '&#124;' => '|', '&#125;' => '}', '&#126;' => '~',

  # Codes 128-159
  '&#128;' => '€', '&#129;' => ' ', '&#130;' => '‚', '&#131;' => 'ƒ',
  '&#132;' => '„', '&#133;' => '…', '&#134;' => '†', '&#135;' => '‡',
  '&#136;' => 'ˆ', '&#137;' => '‰', '&#138;' => 'Š', '&#139;' => '‹',
  '&#140;' => 'Œ', '&#141;' => ' ', '&#142;' => 'Ž', '&#143;' => ' ',
  '&#144;' => ' ', '&#145;' => '‘', '&#146;' => '’', '&#147;' => '"',
  '&#148;' => '”', '&#149;' => '•', '&#150;' => '–', '&#151;' => '—',
  '&#152;' => '˜', '&#153;' => '™', '&#154;' => 'š', '&#155;' => '›',
  '&#156;' => 'œ', '&#157;' => ' ', '&#158;' => 'ž', '&#159;' => 'Ÿ',

  # Codes 160-191 with their named forms
  '&#160;' => ' ', '&nbsp;' => ' ',
  '&#161;' => '¡', '&iexcl;' => '¡',
  '&#162;' => '¢', '&cent;' => '¢',
  '&#163;' => '£', '&pound;' => '£',
  '&#164;' => '¤', '&curren;' => '¤',
  '&#165;' => '¥', '&yen;' => '¥',
  '&#166;' => '¦', '&brvbar;' => '¦',
  '&#167;' => '§', '&sect;' => '§',
  '&#168;' => '¨', '&uml;' => '¨',
  '&#169;' => '©', '&copy;' => '©',
  '&#170;' => 'ª', '&ordf;' => 'ª',
  '&#171;' => '«', '&laquo;' => '«',
  '&#172;' => '¬', '&not;' => '¬',
  '&#173;' => '', '&shy;' => '',
  '&#174;' => '®', '&reg;' => '®',
  '&#175;' => '¯', '&macr;' => '¯',
  '&#176;' => '°', '&deg;' => '°',
  '&#177;' => '±', '&plusmn;' => '±',
  '&#178;' => '²', '&sup2;' => '²',
  '&#179;' => '³', '&sup3;' => '³',
  '&#180;' => '´', '&acute;' => '´',
  '&#181;' => 'µ', '&micro;' => 'µ',
  '&#182;' => '¶', '&para;' => '¶',
  '&#183;' => '·', '&middot;' => '·',
  '&#184;' => '¸', '&cedil;' => '¸',
  '&#185;' => '¹', '&sup1;' => '¹',
  '&#186;' => 'º', '&ordm;' => 'º',
  '&#187;' => '»', '&raquo;' => '»',
  '&#188;' => '¼', '&frac14;' => '¼',
  '&#189;' => '½', '&frac12;' => '½',
  '&#190;' => '¾', '&frac34;' => '¾',
  '&#191;' => '¿', '&iquest;' => '¿',

  # Codes 192-255
  '&#192;' => 'À', '&#193;' => 'Á', '&#194;' => 'Â', '&#195;' => 'Ã',
  '&#196;' => 'Ä', '&#197;' => 'Å', '&#198;' => 'Æ', '&#199;' => 'Ç',
  '&#200;' => 'È', '&#201;' => 'É', '&#202;' => 'Ê', '&#203;' => 'Ë',
  '&#204;' => 'Ì', '&#205;' => 'Í', '&#206;' => 'Î', '&#207;' => 'Ï',
  '&#208;' => 'Ð', '&#209;' => 'Ñ', '&#210;' => 'Ò', '&#211;' => 'Ó',
  '&#212;' => 'Ô', '&#213;' => 'Õ', '&#214;' => 'Ö', '&#215;' => '×',
  '&#216;' => 'Ø', '&#217;' => 'Ù', '&#218;' => 'Ú', '&#219;' => 'Û',
  '&#220;' => 'Ü', '&#221;' => 'Ý', '&#222;' => 'Þ', '&#223;' => 'ß',
  '&#224;' => 'à', '&#225;' => 'á', '&#226;' => 'â', '&#227;' => 'ã',
  '&#228;' => 'ä', '&#229;' => 'å', '&#230;' => 'æ', '&#231;' => 'ç',
  '&#232;' => 'è', '&#233;' => 'é', '&#234;' => 'ê', '&#235;' => 'ë',
  '&#236;' => 'ì', '&#237;' => 'í', '&#238;' => 'î', '&#239;' => 'ï',
  '&#240;' => 'ð', '&#241;' => 'ñ', '&#242;' => 'ò', '&#243;' => 'ó',
  '&#244;' => 'ô', '&#245;' => 'õ', '&#246;' => 'ö', '&#247;' => '÷',
  '&#248;' => 'ø', '&#249;' => 'ù', '&#250;' => 'ú', '&#251;' => 'û',
  '&#252;' => 'ü', '&#253;' => 'ý', '&#254;' => 'þ', '&#255;' => 'ÿ',

  # Additional Latin letters
  '&#338;' => 'Œ', '&#339;' => 'œ', '&#352;' => 'Š', '&#353;' => 'š',
  '&#376;' => 'Ÿ',
  '&#402;' => 'ƒ', '&fnof;' => 'ƒ',

  # Greek capitals
  '&#913;' => 'Α', '&Alpha;' => 'Α',
  '&#914;' => 'Β', '&Beta;' => 'Β',
  '&#915;' => 'Γ', '&Gamma;' => 'Γ',
  '&#916;' => 'Δ', '&Delta;' => 'Δ',
  '&#917;' => 'Ε', '&Epsilon;' => 'Ε',
  '&#918;' => 'Ζ', '&Zeta;' => 'Ζ',
  '&#919;' => 'Η', '&Eta;' => 'Η',
  '&#920;' => 'Θ', '&Theta;' => 'Θ',
  '&#921;' => 'Ι', '&Iota;' => 'Ι',
  '&#922;' => 'Κ', '&Kappa;' => 'Κ',
  '&#923;' => 'Λ', '&Lambda;' => 'Λ',
  '&#924;' => 'Μ', '&Mu;' => 'Μ',
  '&#925;' => 'Ν', '&Nu;' => 'Ν',
  '&#926;' => 'Ξ', '&Xi;' => 'Ξ',
  '&#927;' => 'Ο', '&Omicron;' => 'Ο',
  '&#928;' => 'Π', '&Pi;' => 'Π',
  '&#929;' => 'Ρ', '&Rho;' => 'Ρ',
  '&#931;' => 'Σ', '&Sigma;' => 'Σ',
  '&#932;' => 'Τ', '&Tau;' => 'Τ',
  '&#933;' => 'Υ', '&Upsilon;' => 'Υ',
  '&#934;' => 'Φ', '&Phi;' => 'Φ',
  '&#935;' => 'Χ', '&Chi;' => 'Χ',
  '&#936;' => 'Ψ', '&Psi;' => 'Ψ',
  '&#937;' => 'Ω', '&Omega;' => 'Ω',

  # Greek lower case and symbol variants
  '&#945;' => 'α', '&alpha;' => 'α',
  '&#946;' => 'β', '&beta;' => 'β',
  '&#947;' => 'γ', '&gamma;' => 'γ',
  '&#948;' => 'δ', '&delta;' => 'δ',
  '&#949;' => 'ε', '&epsilon;' => 'ε',
  '&#950;' => 'ζ', '&zeta;' => 'ζ',
  '&#951;' => 'η', '&eta;' => 'η',
  '&#952;' => 'θ', '&theta;' => 'θ',
  '&#953;' => 'ι', '&iota;' => 'ι',
  '&#954;' => 'κ', '&kappa;' => 'κ',
  '&#955;' => 'λ', '&lambda;' => 'λ',
  '&#956;' => 'μ', '&mu;' => 'μ',
  '&#957;' => 'ν', '&nu;' => 'ν',
  '&#958;' => 'ξ', '&xi;' => 'ξ',
  '&#959;' => 'ο', '&omicron;' => 'ο',
  '&#960;' => 'π', '&pi;' => 'π',
  '&#961;' => 'ρ', '&rho;' => 'ρ',
  '&#962;' => 'ς', '&sigmaf;' => 'ς',
  '&#963;' => 'σ', '&sigma;' => 'σ',
  '&#964;' => 'τ', '&tau;' => 'τ',
  '&#965;' => 'υ', '&upsilon;' => 'υ',
  '&#966;' => 'φ', '&phi;' => 'φ',
  '&#967;' => 'χ', '&chi;' => 'χ',
  '&#968;' => 'ψ', '&psi;' => 'ψ',
  '&#969;' => 'ω', '&omega;' => 'ω',
  '&#977;' => 'ϑ', '&thetasym;' => 'ϑ',
  '&#978;' => 'ϒ', '&upsih;' => 'ϒ',
  '&#982;' => 'ϖ', '&piv;' => 'ϖ',

  # General punctuation
  '&#8211;' => '–', '&#8212;' => '—',
  '&#8216;' => '‘', '&#8217;' => '’', '&#8218;' => '‚',
  '&#8220;' => '“', '&#8221;' => '”', '&#8222;' => '„',
  '&#8224;' => '†', '&#8225;' => '‡',
  '&#8226;' => '•', '&bull;' => '•',
  '&#8230;' => '…', '&hellip;' => '…',
  '&#8240;' => '‰',
  '&#8242;' => '′', '&prime;' => '′',
  '&#8243;' => '″', '&Prime;' => '″',
  '&#8254;' => '‾', '&oline;' => '‾',
  '&#8260;' => '⁄', '&frasl;' => '⁄',
  '&#8364;' => '€', '&euro;' => '€',

  # Letter-like symbols
  '&#8465;' => 'ℑ', '&image;' => 'ℑ',
  '&#8472;' => '℘', '&weierp;' => '℘',
  '&#8476;' => 'ℜ', '&real;' => 'ℜ',
  '&#8482;' => '™', '&trade;' => '™',
  '&#8501;' => 'ℵ', '&alefsym;' => 'ℵ',

  # Arrows
  '&#8592;' => '←', '&larr;' => '←',
  '&#8593;' => '↑', '&uarr;' => '↑',
  '&#8594;' => '→', '&rarr;' => '→',
  '&#8595;' => '↓', '&darr;' => '↓',
  '&#8596;' => '↔', '&harr;' => '↔',
  '&#8629;' => '↵', '&crarr;' => '↵',
  '&#8656;' => '⇐', '&lArr;' => '⇐',
  '&#8657;' => '⇑', '&uArr;' => '⇑',
  '&#8658;' => '⇒', '&rArr;' => '⇒',
  '&#8659;' => '⇓', '&dArr;' => '⇓',
  '&#8660;' => '⇔', '&hArr;' => '⇔',

  # Mathematical operators
  '&#8704;' => '∀', '&forall;' => '∀',
  '&#8706;' => '∂', '&part;' => '∂',
  '&#8707;' => '∃', '&exist;' => '∃',
  '&#8709;' => '∅', '&empty;' => '∅',
  '&#8711;' => '∇', '&nabla;' => '∇',
  '&#8712;' => '∈', '&isin;' => '∈',
  '&#8713;' => '∉', '&notin;' => '∉',
  '&#8715;' => '∋', '&ni;' => '∋',
  '&#8719;' => '∏', '&prod;' => '∏',
  '&#8721;' => '∑', '&sum;' => '∑',
  '&#8722;' => '−', '&minus;' => '−',
  '&#8727;' => '∗', '&lowast;' => '∗',
  '&#8730;' => '√', '&radic;' => '√',
  '&#8733;' => '∝', '&prop;' => '∝',
  '&#8734;' => '∞', '&infin;' => '∞',
  '&#8736;' => '∠', '&ang;' => '∠',
  '&#8743;' => '∧', '&and;' => '∧',
  '&#8744;' => '∨', '&or;' => '∨',
  '&#8745;' => '∩', '&cap;' => '∩',
  '&#8746;' => '∪', '&cup;' => '∪',
  '&#8747;' => '∫', '&int;' => '∫',
  '&#8756;' => '∴', '&there4;' => '∴',
  '&#8764;' => '∼', '&sim;' => '∼',
  '&#8773;' => '≅', '&cong;' => '≅',
  '&#8776;' => '≈', '&asymp;' => '≈',
  '&#8800;' => '≠', '&ne;' => '≠',
  '&#8801;' => '≡', '&equiv;' => '≡',
  '&#8804;' => '≤', '&le;' => '≤',
  '&#8805;' => '≥', '&ge;' => '≥',
  '&#8834;' => '⊂', '&sub;' => '⊂',
  '&#8835;' => '⊃', '&sup;' => '⊃',
  '&#8836;' => '⊄', '&nsub;' => '⊄',
  '&#8838;' => '⊆', '&sube;' => '⊆',
  '&#8839;' => '⊇', '&supe;' => '⊇',
  '&#8853;' => '⊕', '&oplus;' => '⊕',
  '&#8855;' => '⊗', '&otimes;' => '⊗',
  '&#8869;' => '⊥', '&perp;' => '⊥',
  '&#8901;' => '⋅', '&sdot;' => '⋅',

  # Miscellaneous technical and shapes
  '&#8968;' => '⌈', '&lceil;' => '⌈',
  '&#8969;' => '⌉', '&rceil;' => '⌉',
  '&#8970;' => '⌊', '&lfloor;' => '⌊',
  '&#8971;' => '⌋', '&rfloor;' => '⌋',
  '&#9001;' => '⟨', '&lang;' => '⟨',
  '&#9002;' => '⟩', '&rang;' => '⟩',
  '&#9674;' => '◊', '&loz;' => '◊',
  '&#9824;' => '♠', '&spades;' => '♠',
  '&#9827;' => '♣', '&clubs;' => '♣',
  '&#9829;' => '♥', '&hearts;' => '♥',
  '&#9830;' => '♦', '&diams;' => '♦'
}.freeze

# Replaces every HTML character reference that is listed in HTML_CODE_TABLE
# with its symbol and returns the result as a new string.
#
# string - the text to clean; may be nil.
#
# Returns the cleaned string, or nil when string is nil. References that are
# not in the table are left in the text unchanged.
def self.clean_string(string)
  return if string.nil?

  # One left-to-right pass: text produced by a replacement is never scanned
  # again, so "&amp;lt;" becomes "&lt;" rather than being decoded twice.
  string.gsub(/&(?:#\d+|[A-Za-z][A-Za-z0-9]*);/) do |code|
    HTML_CODE_TABLE.fetch(code, code)
  end
end