#! /usr/bin/env ruby

# mp3scrape - scan a page for mp3 links and mirror them into a local
# directory tree. Built on the `main` gem's command-line DSL.

require "yaml"
require "uri"
require "open-uri"
require "fileutils"
require "cgi"

begin
  require "rubygems"
rescue LoadError
  42
end

begin
  require "main"
rescue LoadError
  STDERR.puts "gem install main"
  exit 1
end

STDERR.sync = STDOUT.sync = true
trap("INT"){ exit }

Main {
  Home    = File.expand_path(ENV["HOME"] || ENV["USERPROFILE"] || "~")
  Basedir = File.join(Home, "mp3")

  description <<-txt
    mp3scrape will scour any url for it's mp3 content - the script mirrors,
    never downloading the same file twice
  txt

  example <<-txt
    1) get a bunch of xmas tunes
      mp3scrape http://fuelfriends.blogspot.com/2007/12/christmas-mixery.html
  txt

  argument("uri"){
    description "the uri to scrape"
    cast :uri
  }

  option("pattern", "p"){
    description "specifiy the mp3 pattern"
    argument_required
    # The dot before "mp3" is escaped so only a literal ".mp3" suffix matches,
    # and both http and https links are accepted.
    default %|['"](https?://[^\\s]+[^/\\s]+\\.mp3)["']|
  }

  option("basedir", "b"){
    description "specifiy the base download dir - default(#{ Basedir })"
    argument_required
    default Basedir
  }

  option("destination", "d"){
    description "specifiy the absolute download dir - default(#{ File.join Basedir, 'auto-based-on-uri' })"
    argument_required
  }

  option("list"){
    description "only list the mp3s that would be scraped"
  }

  option("noop", "n"){
    description "show the downloads that would be performed"
  }

  # Entry point: scan the page for mp3 links, then list them, show the
  # planned downloads, or mirror them, depending on the options given.
  def run
    uri     = param["uri"].value
    pattern = %r/#{ param["pattern"].value }/
    srcs    = read_page(uri).scan(pattern).flatten.compact

    if param["list"].given?
      puts srcs
      exit
    end

    dsts = destinations_for(srcs, param["destination"].value, param["basedir"].value)
    spec = srcs.zip(dsts)

    if param["noop"].given?
      spec.each{|src, dst| puts "#{ src } -> #{ dst }"}
    else
      mirror spec
    end
  end

  # Returns the body of the page at +uri+ as a String.
  #
  # URI#open (mixed in by open-uri) is used rather than Kernel#open, whose URL
  # support was removed in Ruby 3.0. A value whose parsed URI has no #open
  # (for example a plain local path) is read as a file. The block form closes
  # the handle once the body has been read.
  def read_page(uri)
    target = URI.parse(uri.to_s)
    if target.respond_to?(:open)
      target.open{|io| io.read}
    else
      File.read(uri.to_s)
    end
  end

  # Downloads every [src, dst] pair in +spec+.
  #
  # A file is written only when the remote Last-Modified time is newer than
  # the local file's mtime; afterwards the local atime/mtime are set to that
  # remote time so the next run can skip it. Prints "src -> dst" for a
  # download and "src == dst" for a skip.
  def mirror(spec)
    spec.each do |src, dst|
      FileUtils.mkdir_p(File.dirname(dst))
      mtime = local_mtime(dst)

      URI.parse(src.to_s).open do |remote|
        print src
        begin
          # last_modified is nil when the server sends no Last-Modified
          # header; fall back to "now" so the file is fetched instead of
          # failing on nil > mtime.
          last_modified = remote.last_modified || Time.now

          if last_modified > mtime
            print " -> "
            data = remote.read
            File.open(dst, "wb"){|file| file.write data}
            File.utime last_modified, last_modified, dst
          else
            print " == "
          end
        ensure
          # Always finish the progress line, even if the transfer failed.
          puts dst
        end
      end
    end
  end

  # Returns the mtime of +path+, or the epoch when it cannot be stat'ed
  # (typically because it has not been downloaded yet).
  def local_mtime(path)
    File.stat(path).mtime
  rescue SystemCallError
    Time.at(0)
  end

  # Maps each source url to an absolute local path.
  #
  # srcs        - Array of url strings.
  # destination - when given, every file goes directly into this directory
  #               under its cleaned basename.
  # basedir     - root used when no destination is given; files land under
  #               basedir/<host>/<cleaned url path>. Defaults to Basedir.
  #
  # Returns an Array of expanded paths, in the same order as +srcs+.
  def destinations_for(srcs, destination = nil, basedir = Basedir)
    srcs.map do |src|
      path =
        if destination
          File.join(destination, clean(File.basename(src)))
        else
          uri      = URI.parse(src.to_s)
          paths    = uri.path.split("/").map{|part| clean part}
          basename = clean(paths.pop) # clean(nil) is "" when the path is empty
          [ basedir, uri.host, paths, basename ].flatten.compact.join(File::SEPARATOR)
        end

      File.expand_path(path)
    end
  end

  # Url-decodes +basename+ and squashes every run of characters outside
  # [0-9a-zA-Z_@)(~.-] into a single underscore.
  def clean(basename)
    CGI.unescape(basename.to_s).gsub(%r/[^0-9a-zA-Z_@)(~.-]/, '_').gsub(%r/_+/,'_')
  end
}