#
# The following code is a direct translation into ruby of
# the sweet 'duplocator' code by John Hoogstrate, located at
#  http://brainerror.net/scripts/python/duplocator/
#
# I even conserved the fairly Pythonic idioms in Ruby
# (e.g. using "for...in" syntactic sugar for ".each") just
# for the fun of seeing how close I could make it.
# I copied the comments too.
#
# It's not *exactly* the same but it's as close as I could
# make it.  You may assume any errors are mine, not 
# Mr. Hoogstrate's, and all the awesomeness is his.
#

require 'set'
require 'find'
require 'YAML'

BLOCKSIZE = 1024 * 8

# A small pool of read-only file handles keyed by path.
#
# At most +max_open_files+ handles are meant to be open at once.  When the
# limit is reached, one open handle is closed and its read position is
# remembered, so a later #open of the same path resumes where it left off.
#
# @files maps path => { :file => IO or nil, :pos => Integer }, where
# :file is nil for an entry whose handle was closed to make room.
class Files
  attr_accessor :files, :open_files, :max_open_files

  def initialize
    @files = {}
    @open_files = 0
    @max_open_files = 512
  end

  # Opens +fn+ for reading, positions it at byte offset +pos+, records it
  # in the pool and returns the handle.  If the pool is at (or over) its
  # limit, one currently open handle is closed first.
  def _open(fn, pos=0)
    # ">=" rather than "==": if max_open_files was lowered below the
    # current count, "==" would never trigger again.
    _close_a_file if @open_files >= @max_open_files

    # Binary mode: the contents are compared byte for byte, and the saved
    # offsets from tell/seek are byte offsets.
    fh = File.open(fn, 'rb')
    fh.seek(pos) if pos > 0
    @files[fn] = { :file => fh, :pos => pos }
    @open_files += 1
    fh
  end

  # Returns a handle for +fn+: the pooled one if it is still open,
  # otherwise a fresh one positioned at the remembered offset (0 for a
  # path the pool has not seen).
  def open(fn)
    entry = @files[fn]
    return _open(fn, 0) unless entry

    entry[:file] || _open(fn, entry[:pos])
  end

  # Closes the first entry (in hash order) that still holds a handle,
  # remembering its position.  Does nothing if no entry holds a handle.
  def _close_a_file
    victim, = @files.find { |_path, entry| entry[:file] }
    close(victim) if victim
  end

  # Closes the handle for +fn+.
  #
  # The entry is forgotten entirely when +remove+ is truthy, when there is
  # no usable position to remember (no handle, or the handle was already
  # closed), or when the position is 0 (re-opening from the start needs no
  # bookkeeping).  Otherwise the position is kept for a later #open.
  #
  # Closing a path the pool does not know is a no-op and returns nil.
  def close(fn, remove=nil)
    f = @files[fn]
    return unless f

    fh = f[:file]
    pos = nil
    if fh
      # The slot was counted when _open stored this handle, so release it
      # even if somebody else already closed the handle.
      @open_files -= 1
      unless fh.closed?
        pos = fh.tell
        fh.close
      end
    end

    if remove || pos.nil? || pos == 0
      @files.delete(fn)
    else
      f[:pos] = pos
      f[:file] = nil
    end
  end

end

# Finds groups of byte-identical files beneath one or more directories.
#
# Files are first bucketed by size.  Buckets holding more than one file are
# then compared BLOCKSIZE bytes at a time; files whose blocks differ are
# separated into smaller candidate groups until every remaining group has
# been read to end-of-file (duplicates) or has shrunk to a single file.
class Dupfinder
  # dirs - optional array of directory paths to scan.
  def initialize(dirs=nil)
    @files = {}                  # size in bytes => array of paths
    @inodes = Set.new            # "dev ino" strings already recorded
    @file_handles = Files.new    # handle pool used while comparing
    @dirs = []
    @bytes_read = 0
    @group = 1                   # 1-based index reported to the progress block
    @num_files = 0
    add_dirs(dirs) if dirs
  end

  # Appends the given array of directory paths to the scan list.
  def add_dirs(dirs)
    @dirs.concat(dirs)
  end

  # Records +fn+ under its size.  Symlinks are skipped, and so is any path
  # whose device/inode pair was already recorded (e.g. another hard link
  # to the same file).
  def _add_file(fn)
    return if File.symlink?(fn)

    stat = File.stat(fn)
    tup = "#{stat.dev} #{stat.ino}" # ok so we don't have tuples in ruby :)
    return unless @inodes.add?(tup) # but Set#add? is kind of cool :)

    (@files[stat.size] ||= []) << fn
  end

  # Walks +dir+ recursively, handing every non-directory path to _add_file.
  def _walk_dir(dir)
    Find.find(dir) do |path|
      next if File.directory?(path)

      _add_file(path)
      @num_files += 1
    end
  end

  # Scans every configured directory and remembers the size of the largest
  # same-size bucket (0 when nothing was found).
  def _build_flist
    @dirs.each { |dir| _walk_dir(dir) }
    @longest_chain = @files.values.map(&:length).max || 0
  end

  # Drops every size bucket that holds fewer than two files -- a file with
  # a unique size cannot have a duplicate -- and records how many buckets
  # remain.
  def _clear_singles
    # The per-bucket file count is what matters here, not the number of
    # buckets in @files.
    @files.delete_if { |_size, paths| paths.length < 2 }
    @groups = @files.length
  end

  def _get_fh(path)
    @file_handles.open(path)
  end

  def _close_fh(path)
    @file_handles.close(path, true)
  end

  # Compares +files+ (paths of equal size) block by block.
  #
  # Returns an array of groups, each group being an array of paths with
  # identical content; the array is empty when nothing matches.  +files+ is
  # modified in place: paths that turn out to be unique are removed.
  #
  # When +split+ is truthy and every file in +files+ stays identical up to
  # end-of-file, the bare +files+ array is returned instead of being
  # wrapped in an outer array.
  def _identical(files, split=nil)
    chains = []
    loop do
      chunks = {}
      files.each do |path|
        chunk = _get_fh(path).read(BLOCKSIZE)
        # read returns nil at EOF and may return a short final block
        @bytes_read += chunk.bytesize if chunk
        (chunks[chunk] ||= []) << path
      end

      #test for EOF on all files - means I found duplicate files
      if chunks.length == 1 && chunks.key?(nil)
        files.each { |fn| _close_fh(fn) }
        return split ? files : [files]
      end

      chains = []
      chunks.each_value do |matches|
        if matches.length > 1
          #matches - a list of filenames, are possibly duplicates
          chains << matches
        else
          #for a chunk that had no dupes, I can stop reading this file
          fn = matches.first
          _close_fh(fn)
          files.delete(fn)
        end
      end

      #say I have 4 files, and 1 == 3 and 2 == 4, but 1 != 2, I need to
      #re-call this function with [1,3] and [2,4] as files
      #otherwise, all the files were the same so far, and I can stay
      #in this loop
      break unless chains.length == 1
    end

    # Each recursive call yields a list of groups; concatenating them keeps
    # the result flat however many times the candidates split.
    same = []
    chains.each { |chain| same.concat(_identical(chain)) }
    same
  end

  # Returns the duplicate groups within one size bucket.  Empty files need
  # no reading: they are all identical to each other.
  def _is_dup(size, files)
    return [files] if size == 0

    _identical(files)
  end

  # Scans the configured directories and returns an array of duplicate
  # groups (arrays of paths).  If a block is given it is called after each
  # size bucket with (current bucket number, total number of buckets).
  def find_dups
    _build_flist
    _clear_singles
    dupes = []
    @files.each do |size, files|
      dupes.concat(_is_dup(size, files))
      _progress { |*args| yield(*args) } if block_given?
    end
    dupes.reject!(&:empty?)
    dupes
  end

  # Yields the current and total bucket counts, then advances the counter.
  def _progress
    yield @group, @groups
    @group += 1
  end
end


# Shoes front end: choose a folder, scan it for duplicate files, show the
# result as YAML and optionally save that YAML to a file.
Shoes.app do
  # Scans @dupefolder with Dupfinder and appends the outcome to @results.
  # The duplicate groups are kept in @dupes for the save button.
  def find_dupes
    # to_s so an unset folder falls through to the else branch
    if File.directory?(@dupefolder.to_s)
      d = Dupfinder.new
      d.add_dirs([@dupefolder])
      @dupes = d.find_dups
      @results.append do
        para "done"
        para @dupes.to_yaml
      end
    else
      @results.append { para "search WHERE for dupes?" }
    end
  end

  stack do
    para "Search for dupes in folder...."
    @dupelabel = stack
    button "Choose Folder...." do
      chosen = ask_open_folder
      # Only replace the current selection with a real directory, so the
      # label and @dupefolder cannot drift apart when the dialog is
      # dismissed (presumably ask_open_folder returns nil then -- confirm).
      if File.directory?(chosen.to_s)
        @dupefolder = chosen
        @dupelabel.clear { para @dupefolder }
      end
    end
    button "search for dupes" do
      if @dupefolder
        @results.clear do
          para "searching for dupes in #{@dupefolder}"
          para "be patient, this can take a little while..."
        end
        timer(1) do # to avoid doing this inside the button handler!
          find_dupes
        end
      end
    end
    button "save results in yaml format" do
      if @dupes
        savefile = ask_save_file
        # Nothing to do when no file name came back from the dialog.
        if savefile
          debug "saving in #{savefile}"
          # Block form closes the file even if the dump raises.
          File.open(savefile, 'w') do |save_handle|
            YAML.dump @dupes, save_handle
          end
        end
      end
    end
    @results = stack
  end
end

