git.oblomov.eu Git - ohcount/blob - test/detect_files/findup

   1 #!/bin/bash
   2
   3 # findup - find duplicate files
   4 # Copyright © 2000-2006 by Pádraig Brady <P@draigBrady.com>.
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14 # See the GNU General Public License for more details,
  15 # which is available at www.gnu.org
  16
  17
  18 # Description
  19 #
  20 #       will show duplicate files in the specified directories
  21 #       (and their subdirectories), in the format:
  22 #
  23 #       2 * 2048        file1 file2
  24 #       3 * 1024        file3 file4 file5
  25 #       2 * 1024        file6 file7
  26 #
  27 #       Where the number is the disk usage in bytes of each of the
  28 #       duplicate files on that line, and all duplicate files are
  29 #       shown on the same line.
  30 #               Output is ordered by largest disk usage first and
  31 #       then by the number of duplicate files.
  32 #
  33 # Caveats/Notes:
  34 #       I compared this to any equivalent utils I could find (as of Nov 2000)
  35 #       and it's (by far) the fastest, has the most functionality (thanks to
  36 #       find) and has no (known) bugs. In my opinion fdupes is the next best but
  37 #       is slower (even though written in C), and has a bug where hard links
  38 #       in different directories are reported as duplicates sometimes.
  39 #
  40 #       This script requires uniq > V2.0.21 (part of GNU textutils|coreutils)
  41 #       undefined operation if any dir/file names contain \n or \\
  42 #       sparse files are not treated differently.
  43 #       Don't specify params to find that affect output etc. (e.g -printf etc.)
  44 #       zero length files are ignored.
  45 #       symbolic links are ignored.
  46 #       path1 & path2 can be files &/or directories
  47
  48 script_dir=`dirname $0`                #directory of this script
  49 script_dir=`readlink -f "$script_dir"` #Make sure absolute path
  50
  51 . $script_dir/supprt/fslver
  52
  53 Usage() {
  54         ProgName=`basename "$0"`
  55         echo "find dUPlicate files.
  56 Usage: $ProgName [[-t [-m|-d]] [-r] [-f] paths(s) ...]
  57
  58 If no path(s) specified then the currrent directory is assumed.
  59
  60 When -m is specified any found duplicates will be merged (using hardlinks).
  61 When -d is specified any found duplicates will be deleted (only 1 left).
  62 When -t is specfied, only report what -m or -d would do.
  63
  64 You can also pipe output to $script_dir/fstool/dupwaste to
  65 get a total of the wastage due to duplicates.
  66
  67 Examples:
  68
  69 search for duplicates in current directory and below
  70         findup or findup .
  71 search for duplicates in all linux source directories and merge using hardlinks
  72         findup -m /usr/src/linux*
  73 same as above but don't look in subdirectories
  74         findup -r .
  75 search for duplicates in /usr/bin
  76         findup /usr/bin
  77 search in multiple directories but not their subdirectories
  78         findup -r /usr/bin /bin /usr/sbin /sbin
  79 search for duplicates in \$PATH
  80         findup \`$script_dir/supprt/getffp\`
  81 search system for duplicate files over 100K in size
  82         findup / -size +100k
  83 search only my files (that I own and are in my home dir)
  84         findup ~ -user \`id -u\`
  85 search system for duplicate files belonging to roger
  86         findup / -user \`id -u roger\`"
  87         exit
  88 }
  89
  90 for arg
  91 do
  92         case "$arg" in
  93         -h|--help|-help)
  94                 Usage ;;
  95         -v|--version)
  96                 Version ;;
  97         --gui)
  98                 mode="gui" ;;
  99         -m)
 100                 mode="merge" ;;
 101         -d)
 102                 mode="del" ;;
 103         -t)
 104                 t="t" ;;
 105         *)
 106                 argsToPassOn="$argsToPassOn '$arg'"
 107         esac
 108 done
 109 [ "$mode" = "merge" ] && argsToPassOn="$argsToPassOn -xdev"
 110
 111 if [ ! -z "$mode" ]; then
 112     forceFullPath="-f"
 113     sep_mode="prepend"
 114 else
 115     sep_mode="none"
 116 fi
 117
 118 if [ "$mode" = "gui" ] || [ "$mode" = "merge" ] || [ "$mode" = "del" ]; then
 119     merge_early=""    #process hardlinks
 120 else
 121     merge_early="-u"  #ignore hardlinks
 122 fi
 123
 124 . $script_dir/supprt/getfpf $forceFullPath "$argsToPassOn"
 125
 126 check_uniq
 127
 128 if [ `find . -maxdepth 0 -printf "%D" 2> /dev/null` = "D" ]
 129 then
 130     devFmt="\060" #0
 131 else
 132     devFmt=%D #This is new and will help find more duplicate files
 133 fi
 134                      #print name, inode & size.
 135 find "$@" -size +0c -type f -printf "$FPF\0$devFmt\0%i\0%s\n" |
 136 tr ' \t\0' '\0\1 ' | #remove spaces, tabs in file names
 137 sort -k4,4nr -k2,2n -k3,3 $merge_early |#group [and merge] size,dev & inodes
 138 if [ -z "$merge_early" ]; then
 139     $script_dir/supprt/rmlint/merge_hardlinks
 140 else
 141     uniq -3 -D       #pick just duplicate filesizes
 142 fi |
 143 sort -k2,2n -k3,3n | #NB sort inodes so md5sum does less seeking all over disk
 144 cut -f1 -d' ' -s |   #get filenames to work on
 145 tr '\0\1\n' ' \t\0' |#reset any space & tabs etc and delimit names with \0
 146
 147 # The following optional block, md5sums a small sample of each file,
 148 # which can help when there are many files of the same size,
 149 # even more so if they are large. This usually adds a small amount of
 150 # runtime, however it can save a large amount of time in certain situations.
 151 if $script_dir/supprt/md5sum_approx; then
 152     xargs -r0 $script_dir/supprt/md5sum_approx |
 153     sort |                     #group duplicate files together
 154     uniq --all-repeated -w32 | #pick just duplicates
 155     cut -d' ' -f3- |           #get filenames
 156     sort |                     #sort by paths to try to minimise disk seeks
 157     tr '\n' '\0'               #delimit names with \0
 158 else
 159     cat
 160 fi |
 161
 162 # This block selects duplicates using md5sum of whole file
 163 xargs -r0 md5sum -- |      #calculate md5sums for possible duplicates
 164 sort |                     #group duplicate files together
 165 uniq --all-repeated -w32 | #pick just duplicates
 166
 167 # The following optional block, checks duplicates again using sha1
 168 # Note for data sets that don't totally fit in cache this will
 169 # probably read duplicate files off the disk again.
 170 cut -d' ' -f3- |           #get filenames
 171 sort |                     #sort by paths to try to minimise disk seeks
 172 tr '\n' '\0' |             #delimit names with \0
 173 xargs -r0 sha1sum -- |     #to be sure to be sure
 174 sort |                     #group duplicate files together
 175 uniq --all-repeated=$sep_mode -w40 | #pick just duplicates
 176
 177 if [ ! -z "$mode" ]; then
 178   cut -d' ' -f3- |
 179   if [ ! $mode = "gui" ]; then # external call to python as this is faster
 180     if $script_dir/supprt/rmlint/fixdup.py < /dev/null; then
 181         $script_dir/supprt/rmlint/fixdup.py $t$mode
 182     elif $script_dir/supprt/rmlint/fixdup.sh < /dev/null; then
 183         $script_dir/supprt/rmlint/fixdup.sh $t$mode
 184     else
 185         echo "Error, couldn't execute merge util" >&2
 186         exit 1
 187     fi
 188   else
 189     cat
 190   fi
 191 else
 192 (
 193 psum='no match'
 194 line=''
 195 declare -i counter
 196 while read sum file; do           #sum is delimited by first space
 197   if [ "$sum" != "$psum" ]; then
 198     if [ ! -z "$line" ]; then
 199        echo "$counter * $line"
 200     fi
 201     counter=1
 202     line="`du -b "$file"`"
 203     psum="$sum"
 204   else
 205     counter=counter+1             #Use bash arithmetic, not expr (for speed)
 206     line="$line $file"
 207   fi
 208 done
 209
 210 if [ ! -z "$line" ]; then
 211   echo "$counter * $line"
 212 fi
 213 ) |
 214 sort -k3,3 -k1,1 -brn
 215 fi