class BoilerpipeArticle
Public Class Methods
new(html)
click to toggle source
# File lib/boilerpipe_article.rb, line 7 def initialize(html) @html = html.gsub(/\s\s+/,' ') @articlesStats = Hash.new end
Public Instance Methods
calculateBestDepth(articlesStats)
click to toggle source
# File lib/boilerpipe_article.rb, line 96 def calculateBestDepth(articlesStats) bestDepth = Hash.new(0) articlesStats.each do |line,stats| bestDepth[stats[1]]+=stats[0].length end bestvalues = bestDepth.sort_by {|key,value|value}.reverse.to_h average = 0.0 bestDepth.each {|l,v|average+=v/bestDepth.keys.length.to_f} texts = 0 bestDepth.each{|l,v|texts +=1 if v > average} doubleTexts = false doubleTexts = true if texts >= 2 best = bestvalues.keys[0] return best,doubleTexts end
calculateDepth(html = @html)
click to toggle source
# File lib/boilerpipe_article.rb, line 42 def calculateDepth(html = @html) articlesStats = Hash.new doc = Nokogiri::HTML(html) i = 0 doc.xpath('//text()').each do |node| text = node.to_s articlesStats.store(i,[node.text.to_s,node.ancestors.length.to_i,node.parent.name]) i+=1 end return articlesStats end
getAllText(html = @html)
click to toggle source
# File lib/boilerpipe_article.rb, line 169 def getAllText(html = @html) doc = Nokogiri.parse(html) doc.search('script').remove doc.search('style').remove return doc.text.gsub(/\s\s+/,' ') end
getArticle(html = @html)
click to toggle source
# File lib/boilerpipe_article.rb, line 175 def getArticle(html = @html) html = removeBadHtmlTags(html) articlesStats = calculateDepth(html) best,doubleTexts = calculateBestDepth(articlesStats) if doubleTexts html = removeSamePatterns(html) articlesStats,d = calculateDepth(html) end bestDepth,doubles = calculateBestDepth(articlesStats) plainText = getTextOfBestDepth(articlesStats,bestDepth) return plainText end
getMetas(html = @html)
click to toggle source
# File lib/boilerpipe_article.rb, line 124 def getMetas(html = @html) metas = Hash.new doc = Nokogiri.parse(html) doc.xpath("//meta").each do |node| name = node[node.attributes.keys[1]] name = node[node.attributes.keys[0]] if node.attributes.keys[0] != 'content' && node.attributes.keys[0] != 'value' content = node['content'] content = node['value'] if content == nil metas.store(name,content) end return metas end
getMicroData(html = @html)
click to toggle source
# File lib/boilerpipe_article.rb, line 161 def getMicroData(html = @html) doc = Mida::Document.new(html, "") topLevel = Array.new doc.items.each do |item| topLevel.push(item.to_h) end return topLevel end
getOtherHTMLDescriptions(html = @html)
click to toggle source
# File lib/boilerpipe_article.rb, line 137 def getOtherHTMLDescriptions(html = @html) doc = Nokogiri.parse(html) images = Array.new headlines = Hash.new links = Hash.new 5.times do |i| hs = doc.xpath("//h#{i+1}") texts = [] hs.each {|node| texts.push(node.text.to_s)} headlines.store("h#{i+1}",texts) end imgs = doc.xpath('//img/@src') imgs.each do |source| images.push(source.text) if source.text.include?('http') end plinks = doc.xpath('//a/@href') plinks.each do |source| links.store(source.text,1) if source.text.strip.length > 2 end return {'headlines'=>headlines,'images'=>images, 'links' => links.keys} end
getTextOfBestDepth(articlesStats,best)
click to toggle source
# File lib/boilerpipe_article.rb, line 114 def getTextOfBestDepth(articlesStats,best) text = '' articlesStats.each do |line,stats| if stats[1] == best && (stats[-1].eql?('h1') || stats[-1].eql?('h2') || stats[-1].eql?('p')) text = "#{text} <#{stats[-1]}>#{stats[0]}</#{stats[-1]}>" if stats[0].strip.length > 2 end end return text end
removeBadHtmlTags(html = @html)
click to toggle source
# File lib/boilerpipe_article.rb, line 11 def removeBadHtmlTags(html = @html) html = Nokogiri::HTML.parse(html).to_s html.gsub!(/<!-[\s\S]*?->/, '') html.gsub!(/\r?\n|\r/, '') unwantedTags = ['strong','bold','i'] unwantedTags.each do |tag| html.gsub!("<#{tag}>",'') html.gsub!("</#{tag}>",'') end doc = Nokogiri::HTML(html) badHtmlTags = ['script','style','head','nav','iframe','img','footer','ol','ul','li','a'] doc.css('*').each do |node| node.remove if node.text.length < 3 end badHtmlTags.each do |tag| doc.search(tag).each do |src| src.remove end end # doc.css('a').each do |atag| # atag = "#{atag.text}" # puts atag # end html = doc.to_html.to_s return html end
removeSamePatterns(html)
click to toggle source
# File lib/boilerpipe_article.rb, line 53 def removeSamePatterns(html) doc = Nokogiri::HTML(html) paths = Array.new doc.css('*').each do |node| s = node.path.gsub(/\[[\s\S]*?\]/, '') paths.push(s) end final = [] (7..30).each do |i| all = [] paths.each_with_index do |seq,a| se = [] paths[a..-1].each_with_index do |s,ii| se << s break if ii == i-1 end all << se end final << all end allDoubles = Hash.new final.each_with_index do |seq,i| counts = Hash.new(0) seq.each do |name| counts[name] += 1 end counts = counts.sort_by{|k,v|v}.reverse.to_h allDoubles.store(i,counts) end allDoubles.each do |i,doubles| doubles.each do |path,count| if count >= 7 doc.css('*').each do |node| s = node.path.gsub(/\[[\s\S]*?\]/, '') if path.include? s node.remove end end end end end return doc.to_s end