# findup - find duplicate files
# Copyright © 2000-2006 by Pádraig Brady <P@draigBrady.com>.

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details,
# which is available at www.gnu.org
# Shows duplicate files in the specified directories
# (and their subdirectories), in the format:
#
#   2 * 2048    file1 file2
#   3 * 1024    file3 file4 file5
#   2 * 1024    file6 file7
#
# where each line lists the number of duplicates, the disk usage
# in bytes of each of those duplicates, and then all the duplicate
# files themselves.
# Output is ordered by largest disk usage first and
# then by the number of duplicate files.
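# To illustrate with the sample above: "2 * 2048  file1 file2" means file1
# and file2 are duplicates of each other, each using 2048 bytes, so merging
# or deleting one of them would reclaim (2-1)*2048 = 2048 bytes.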
# I compared this to the equivalent utils I could find (as of Nov 2000)
# and it's (by far) the fastest, has the most functionality (thanks to
# find) and has no (known) bugs. In my opinion fdupes is the next best,
# but it's slower (even though written in C), and has a bug where hard
# links in different directories are sometimes reported as duplicates.
# Notes/caveats:
#   This script requires uniq > V2.0.21 (part of GNU textutils|coreutils).
#   Operation is undefined if any dir/file names contain \n or \\.
#   Sparse files are not treated differently.
#   Don't pass params to find that affect its output (e.g. -printf).
#   Zero length files are ignored.
#   Symbolic links are ignored.
#   The specified paths can be files and/or directories.
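#   (To illustrate the \n caveat: the pipeline below uses \n and \0 as
#    record delimiters, so a hypothetical file named "bad<newline>name"
#    would be split into two bogus records and misreported.)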
script_dir=`dirname "$0"`              #directory of this script
script_dir=`readlink -f "$script_dir"` #make sure it's an absolute path

. "$script_dir/supprt/fslver"
Usage() {
    ProgName=`basename "$0"`
    echo "find dUPlicate files.
Usage: $ProgName [[-t [-m|-d]] [-r] [-f] path(s) ...]

If no path(s) are specified then the current directory is assumed.

When -m is specified any found duplicates will be merged (using hardlinks).
When -d is specified any found duplicates will be deleted (leaving just one).
When -t is specified, only report what -m or -d would do.

You can also pipe output to $script_dir/fstool/dupwaste to
get a total of the wastage due to duplicates.

Examples:

search for duplicates in current directory and below
    findup or findup .
search for duplicates in all linux source directories and merge using hardlinks
    findup -m /usr/src/linux*
same as above but don't look in subdirectories
    findup -r -m /usr/src/linux*
search for duplicates in /usr/bin
    findup /usr/bin
search in multiple directories but not their subdirectories
    findup -r /usr/bin /bin /usr/sbin /sbin
search for duplicates in \$PATH
    findup \`$script_dir/supprt/getffp\`
search system for duplicate files over 100K in size
    findup / -size +100k
search only my files (that I own and are in my home dir)
    findup ~ -user \`id -u\`
search system for duplicate files belonging to roger
    findup / -user \`id -u roger\`"
    exit
}
argsToPassOn="$argsToPassOn '$arg'"

[ "$mode" = "merge" ] && argsToPassOn="$argsToPassOn -xdev"
if [ ! -z "$mode" ]; then

if [ "$mode" = "gui" ] || [ "$mode" = "merge" ] || [ "$mode" = "del" ]; then
    merge_early=""   #process hardlinks
else
    merge_early="-u" #ignore hardlinks
fi

. "$script_dir/supprt/getfpf" $forceFullPath "$argsToPassOn"
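#(assumption: getfpf sets $FPF, the find -printf expression used for the
# file name in the find command below, honouring $forceFullPath)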
if [ "`find . -maxdepth 0 -printf '%D' 2> /dev/null`" = "D" ]
then
    devFmt='\060' #find lacks %D support, so print a constant '0' field
else
    devFmt=%D #This is new and will help find more duplicate files
fi
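#(%D is the file's device number: files on different filesystems can share
# an inode number without being hardlinks, so recording the device as well
# lets such files be compared, finding more duplicates)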
#print name, dev, inode & size.
find "$@" -size +0c -type f -printf "$FPF\0$devFmt\0%i\0%s\n" |
tr ' \t\0' '\0\1 ' |  #make fields space separated; spaces & tabs in names become \0 & \1
sort -k4,4nr -k2,2n -k3,3 $merge_early | #group [and merge] by size, dev & inode
if [ -z "$merge_early" ]; then
    $script_dir/supprt/rmlint/merge_hardlinks
else
    uniq -3 -D #pick just duplicate filesizes
fi |
sort -k2,2n -k3,3n |  #NB sort inodes so md5sum does less seeking all over disk
cut -f1 -d' ' -s |    #get filenames to work on
tr '\0\1\n' ' \t\0' | #restore spaces & tabs in names, and delimit names with \0
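# Worked example of the record format (hypothetical values): for a 1024
# byte file "./my file" on device 773 with inode 421, find prints
# "./my file\0773\0421\01024"; the first tr rewrote that to
# "./my\0file 773 421 1024" (4 space-separated fields for sort/uniq/cut),
# and the tr just above restores the name to "./my file", now delimited
# by \0 for xargs -0 below.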
# The following optional block md5sums a small sample of each file,
# which can help when there are many files of the same size,
# even more so if they are large. This usually adds a small amount of
# runtime, however it can save a large amount of time in certain situations.
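#(for example, two equal-sized 1GiB files that differ near the start are
# separated here after reading just the sampled bytes of each, instead of
# both being md5summed in full below)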
if $script_dir/supprt/md5sum_approx; then
    xargs -r0 $script_dir/supprt/md5sum_approx | #md5sum a sample of each file
    sort |                     #group duplicate files together
    uniq --all-repeated -w32 | #pick just duplicates
    cut -d' ' -f3- |           #get filenames
    sort |                     #sort by paths to try to minimise disk seeks
    tr '\n' '\0'               #delimit names with \0
else
    cat #sampling helper unavailable, so pass the \0-delimited names through
fi |
# This block selects duplicates using the md5sum of the whole file
xargs -r0 md5sum -- |      #calculate md5sums for possible duplicates
sort |                     #group duplicate files together
uniq --all-repeated -w32 | #pick just duplicates

# The following optional block checks the duplicates again using sha1.
# Note for data sets that don't totally fit in cache this will
# probably read the duplicate files off the disk again.
cut -d' ' -f3- |           #get filenames
sort |                     #sort by paths to try to minimise disk seeks
tr '\n' '\0' |             #delimit names with \0
xargs -r0 sha1sum -- |     #to be sure to be sure
sort |                     #group duplicate files together
uniq --all-repeated=$sep_mode -w40 | #pick just duplicates
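#(e.g. with hypothetical sums "da39...  ./a", "da39...  ./b", "356a...  ./c",
# comparing only the first 40 chars, the sha1 hex digest, keeps just the
# ./a and ./b lines; $sep_mode controls how duplicate groups are delimited)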
if [ ! -z "$mode" ]; then
    if [ ! "$mode" = "gui" ]; then # external call to python as this is faster
        if $script_dir/supprt/rmlint/fixdup.py < /dev/null; then
            $script_dir/supprt/rmlint/fixdup.py $t$mode
        elif $script_dir/supprt/rmlint/fixdup.sh < /dev/null; then
            $script_dir/supprt/rmlint/fixdup.sh $t$mode
        else
            echo "Error, couldn't execute merge util" >&2
        fi
    fi
else #no mode specified, so format the duplicate list for printing
    { while read sum file; do #sum is delimited by first space
        if [ "$sum" != "$psum" ]; then
            if [ ! -z "$line" ]; then
                echo "$counter * $line"
            fi
            counter=1
            psum="$sum"
            line="`du -b "$file"`" #du gives "size<tab>name" for the first file
        else
            counter=$((counter+1)) #Use bash arithmetic, not expr (for speed)
            line="$line $file"
        fi
    done
    [ ! -z "$line" ] && echo "$counter * $line" #flush the last group
    } |
    sort -k3,3 -k1,1 -brn
fi
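#(the final sort orders by field 3, the byte count from du, then by field 1,
# the number of copies: numeric (-n), largest first (-r), ignoring leading
# blanks (-b), giving the "largest disk usage first" order described above)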