class BuntoImport::Importers::Tumblr
Public Class Methods
# File lib/bunto-import/importers/tumblr.rb, line 25 def self.process(options) url = options.fetch('url') format = options.fetch('format', "html") grab_images = options.fetch('grab_images', false) add_highlights = options.fetch('add_highlights', false) rewrite_urls = options.fetch('rewrite_urls', false) @grab_images = grab_images FileUtils.mkdir_p "_posts/tumblr" url += "/api/read/json/" per_page = 50 posts = [] # Two passes are required so that we can rewrite URLs. # First pass builds up an array of each post as a hash. begin current_page = (current_page || -1) + 1 feed_url = url + "?num=#{per_page}&start=#{current_page * per_page}" puts "Fetching #{feed_url}" feed = open(feed_url) contents = feed.readlines.join("\n") blog = extract_json(contents) puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}" batch = blog["posts"].map { |post| post_to_hash(post, format) } # If we're rewriting, save the posts for later. Otherwise, go ahead and # dump these to disk now if rewrite_urls posts += batch else batch.each {|post| write_post(post, format == "md", add_highlights)} end end until blog["posts"].size < per_page # Rewrite URLs, create redirects and write out out posts if necessary if rewrite_urls posts = rewrite_urls_and_redirects posts posts.each {|post| write_post(post, format == "md", add_highlights)} end end
# File lib/bunto-import/importers/tumblr.rb, line 4 def self.require_deps BuntoImport.require_with_fallback(%w[ rubygems fileutils open-uri nokogiri json uri time bunto ]) end
# File lib/bunto-import/importers/tumblr.rb, line 17 def self.specify_options(c) c.option 'url', '--url URL', 'Tumblr URL' c.option 'format', '--format FORMAT', 'Output format (default: "html")' c.option 'grab_images', '--grab_images', 'Whether to grab images (default: false)' c.option 'add_highlights', '--add_highlights', 'Whether to add highlights (default: false)' c.option 'rewrite_urls', '--rewrite_urls', 'Whether to rewrite URLs (default: false)' end
Private Class Methods
Adds pygments highlight tags to code blocks in posts that use markdown format. This doesn't guess the language of the code block, so you should modify this to suit your own content. For example, my code block only contain Python and JavaScript, so I can assume the block is JavaScript if it contains a semi-colon.
# File lib/bunto-import/importers/tumblr.rb, line 261 def self.add_syntax_highlights(content, redirect_dir) lines = content.split("\n") block, indent, lang, start = false, /^ /, nil, nil lines.each_with_index do |line, i| if !block && line =~ indent block = true lang = "python" start = i elsif block lang = "javascript" if line =~ /;$/ block = line =~ indent && i < lines.size - 1 # Also handle EOF if !block lines[start] = "{% highlight #{lang} %}" lines[i - 1] = "{% endhighlight %}" end FileUtils.cp(redirect_dir + "index.html", redirect_dir + "../" + "index.html") lines[i] = lines[i].sub(indent, "") end end lines.join("\n") end
# File lib/bunto-import/importers/tumblr.rb, line 68 def self.extract_json(contents) beginning = contents.index("{") ending = contents.rindex("}")+1 json = contents[beginning...ending] # Strip Tumblr's JSONP chars. blog = JSON.parse(json) end
Attempts to fetch the largest version of a photo available for a post. If that file fails, it tries the next smaller size until all available photo URLs are exhausted. If they all fail, the import is aborted.
# File lib/bunto-import/importers/tumblr.rb, line 186 def self.fetch_photo(post) sizes = post.keys.map {|k| k.gsub("photo-url-", "").to_i} sizes.sort! {|a,b| b <=> a} ext_key, ext_val = post.find do |k,v| k =~ /^photo-url-/ && v.split("/").last =~ /\./ end ext = "." + ext_val.split(".").last sizes.each do |size| url = post["photo-url"] || post["photo-url-#{size}"] next if url.nil? begin return "<img src=\"#{save_photo(url, ext)}\"/>" rescue OpenURI::HTTPError => err puts "Failed to grab photo" end end abort "Failed to fetch photo for post #{post['url']}" end
Convert preserving HTML tables as per the markdown docs.
# File lib/bunto-import/importers/tumblr.rb, line 241 def self.html_to_markdown(content) preserve = ["table", "tr", "th", "td"] preserve.each do |tag| content.gsub!(/<#{tag}/i, "$$" + tag) content.gsub!(/<\/#{tag}/i, "||" + tag) end content = Nokogiri::HTML(content.gsub("'", "''")).text preserve.each do |tag| content.gsub!("$$" + tag, "<" + tag) content.gsub!("||" + tag, "</" + tag) end content end
Converts each type of Tumblr
post to a hash with all required data for Bunto
.
# File lib/bunto-import/importers/tumblr.rb, line 98 def self.post_to_hash(post, format) case post['type'] when "regular" title = post["regular-title"] content = post["regular-body"] when "link" title = post["link-text"] || post["link-url"] content = "<a href=\"#{post["link-url"]}\">#{title}</a>" unless post["link-description"].nil? content << "<br/>" + post["link-description"] end when "photo" title = post["slug"].gsub("-"," ") if post["photos"].size > 1 content = "" post["photos"].each do |post_photo| photo = fetch_photo post_photo content << photo + "<br/>" content << post_photo["caption"] end else content = fetch_photo post end content << "<br/>" + post["photo-caption"] when "audio" if !post["id3-title"].nil? title = post["id3-title"] content = post["audio-player"] + "<br/>" + post["audio-caption"] else title = post["audio-caption"] content = post["audio-player"] end when "quote" title = post["quote-text"] content = "<blockquote>#{post["quote-text"]}</blockquote>" unless post["quote-source"].nil? content << "—" + post["quote-source"] end when "conversation" title = post["conversation-title"] content = "<section><dialog>" post["conversation"].each do |line| content << "<dt>#{line['label']}</dt><dd>#{line['phrase']}</dd>" end content << "</dialog></section>" when "video" title = post["video-title"] content = post["video-player"] unless post["video-caption"].nil? if content content << "<br/>" + post["video-caption"] else content = post["video-caption"] end end when "answer" title = post["question"] content = post["answer"] end date = Date.parse(post['date']).to_s title = Nokogiri::HTML(title).text title = "no title" if title.empty? slug = if post["slug"] && post["slug"].strip != "" post["slug"] elsif title && title.downcase.gsub(/[^a-z0-9\-]/, '') != '' && title != 'no title' slug = title.downcase.strip.gsub(' ', '-').gsub(/[^a-z0-9\-]/, '') slug.length > 200 ? slug.slice(0..200) : slug else slug = post['id'] end { :name => "#{date}-#{slug}.#{format}", :header => { "layout" => "post", "title" => title, "date" => Time.parse(post['date']).xmlschema, "tags" => (post["tags"] or []), "tumblr_url" => post["url-with-slug"] }, :content => content, :url => post["url"], :slug => post["url-with-slug"], } end
Create a Hash of old urls => new urls, for rewriting and redirects, and replace urls in each post. Instantiate Bunto
site/posts to get the correct permalink format.
# File lib/bunto-import/importers/tumblr.rb, line 211 def self.rewrite_urls_and_redirects(posts) site = Bunto::Site.new(Bunto.configuration({})) urls = Hash[posts.map { |post| # Create an initial empty file for the post so that # we can instantiate a post object. tumblr_url = URI.parse(URI.encode(post[:slug])).path bunto_url = if Bunto.const_defined? :Post File.open("_posts/tumblr/#{post[:name]}", "w") { |f| f.puts } Bunto::Post.new(site, Dir.pwd, "", "tumblr/" + post[:name]).url else Bunto::Document.new(File.expand_path("tumblr/#{post[:name]}"), site: site, collection: site.posts).url end redirect_dir = tumblr_url.sub(/\//, "") + "/" FileUtils.mkdir_p redirect_dir File.open(redirect_dir + "index.html", "w") do |f| f.puts "<html><head><link rel=\"canonical\" href=\"" + "#{bunto_url}\"><meta http-equiv=\"refresh\" content=\"0; " + "url=#{bunto_url}\"></head><body></body></html>" end [tumblr_url, bunto_url] }] posts.map { |post| urls.each do |tumblr_url, bunto_url| post[:content].gsub!(/#{tumblr_url}/i, bunto_url) end post } end
# File lib/bunto-import/importers/tumblr.rb, line 283 def self.save_photo(url, ext) if @grab_images path = "tumblr_files/#{url.split('/').last}" path += ext unless path =~ /#{ext}$/ FileUtils.mkdir_p "tumblr_files" # Don't fetch if we've already cached this file unless File.size? path puts "Fetching photo #{url}" File.open(path, "w") { |f| f.write(open(url).read) } end url = "/" + path end url end
Writes a post out to disk
# File lib/bunto-import/importers/tumblr.rb, line 76 def self.write_post(post, use_markdown, add_highlights) content = post[:content] if content if use_markdown content = html_to_markdown content if add_highlights tumblr_url = URI.parse(post[:slug]).path redirect_dir = tumblr_url.sub(/\//, "") + "/" FileUtils.mkdir_p redirect_dir content = add_syntax_highlights(content, redirect_dir) end end File.open("_posts/tumblr/#{post[:name]}", "w") do |f| f.puts post[:header].to_yaml + "---\n" + content end end end