# Generated from medusa-crawler-1.0.0.gem by gem2rpm -*- rpm-spec -*- %global gem_name medusa-crawler Name: rubygem-%{gem_name} Version: 1.0.0 Release: 1%{?dist} Summary: Medusa is a ruby crawler framework License: MIT URL: https://github.com/brutuscat/medusa-crawler Source0: https://rubygems.org/gems/%{gem_name}-%{version}.gem BuildRequires: ruby(release) BuildRequires: rubygems-devel BuildRequires: ruby >= 2.3.0 BuildArch: noarch %description == Medusa: a ruby crawler framework {rdoc-image:https://badge.fury.io/rb/medusa-crawler.svg}[https://rubygems.org/gems/medusa-crawler] rdoc-image:https://github.com/brutuscat/medusa-crawler/workflows/Ruby/badge.svg?event=push Medusa is a framework for the ruby language to crawl and collect useful information about the pages it visits. It is versatile, allowing you to write your own specialized tasks quickly and easily. === Features * Choose the links to follow on each page with +focus_crawl+ * Multi-threaded design for high performance * Tracks +301+ HTTP redirects * Allows exclusion of URLs based on regular expressions * Records response time for each page * Obey _robots.txt_ directives (optional, but recommended) * In-memory or persistent storage of pages during crawl, provided by Moneta[https://github.com/moneta-rb/moneta] * Inherits OpenURI behavior (redirects, automatic charset and encoding detection, proxy configuration options). Do you have an idea or a suggestion? {Open an issue and talk about it}[https://github.com/brutuscat/medusa-crawler/issues/new] === Examples Medusa is versatile and to be used programatically, you can start with one or multiple URIs: require 'medusa' Medusa.crawl('https://www.example.com', depth_limit: 2) Or you can pass a block and it will yield the crawler back, to manage configuration or drive its crawling focus: require 'medusa' Medusa.crawl('https://www.example.com', depth_limit: 2) do |crawler| crawler.discard_page_bodies = some_flag # Persist all the pages state across crawl-runs. crawler.clear_on_startup = false crawler.storage = Medusa::Storage.Moneta(:Redis, 'redis://redis.host.name:6379/0') crawler.skip_links_like(/private/) crawler.on_pages_like(/public/) do |page| logger.debug "[public page] #{page.url} took #{page.response_time} found #{page.links.count}" end # Use an arbitrary logic, page by page, to continue customize the crawling. crawler.focus_crawl(/public/) do |page| page.links.first end end. . %package doc Summary: Documentation for %{name} Requires: %{name} = %{version}-%{release} BuildArch: noarch %description doc Documentation for %{name}. %prep %setup -q -n %{gem_name}-%{version} %build # Create the gem as gem install only works on a gem file gem build ../%{gem_name}-%{version}.gemspec # %%gem_install compiles any C extensions and installs the gem into ./%%gem_dir # by default, so that we can move it into the buildroot in %%install %gem_install %install mkdir -p %{buildroot}%{gem_dir} cp -a .%{gem_dir}/* \ %{buildroot}%{gem_dir}/ %check pushd .%{gem_instdir} # Run the test suite. popd %files %dir %{gem_instdir} %license %{gem_instdir}/LICENSE.txt %{gem_instdir}/VERSION %{gem_libdir} %exclude %{gem_cache} %{gem_spec} %files doc %doc %{gem_docdir} %doc %{gem_instdir}/CHANGELOG.md %doc %{gem_instdir}/CONTRIBUTORS.md %doc %{gem_instdir}/README.rdoc %{gem_instdir}/Rakefile %{gem_instdir}/spec %changelog * Tue Aug 24 2021 mockbuilder - 1.0.0-1 - Initial package