Follow these steps to find and delete duplicate files:
- Generate some test files:
$ echo "hello" > test; cp test test_copy1; cp test test_copy2
$ echo "next" > other
# test_copy1 and test_copy2 are copies of test
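You can confirm that the copies really are byte-for-byte duplicates by checksumming them before running the script; the three copies of test print the same digest, while other prints a different one (the exact digest values depend on the file contents):
$ md5sum test test_copy1 test_copy2 other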
- The script that finds and removes the duplicate files uses awk, an interpreter available on all Linux/Unix systems:
#!/bin/bash
#Filename: remove_duplicates.sh
#Description: Find and remove duplicate files,
# keeping one sample of each.

# List files sorted by size so that potential duplicates appear on
# adjacent lines (parsing $8 assumes filenames without spaces).
ls -lS --time-style=long-iso | awk 'BEGIN {
  getline; getline;      # skip the "total" line; read the first file entry
  name1=$8; size=$5
}
{
  name2=$8; size2=$5;
  if (size==size2)       # equal sizes: candidate duplicates, compare checksums
  {
    # Read each md5sum line into a variable so the fields of the current
    # ls line are not clobbered; close() each command so it is re-executed
    # rather than read at EOF when the same file appears again.
    cmd1="md5sum " name1; cmd1 | getline sum1; close(cmd1);
    cmd2="md5sum " name2; cmd2 | getline sum2; close(cmd2);
    split(sum1, a); split(sum2, b);
    if (a[1]==b[1])
    {
      print name1; print name2
    }
  }
  size=size2; name1=name2;
}' | sort -u > duplicate_files

# Keep the first filename of each checksum group; the checksum is the
# first 32 characters of every md5sum output line.
cat duplicate_files | xargs -I {} md5sum {} | \
sort | uniq -w 32 | awk '{ print $2 }' | \
sort -u > unique_files

echo Removing...
# Lines found only in duplicate_files (not in unique_files) are the
# redundant copies; echo them to stderr, then delete them.
comm -3 duplicate_files unique_files | tee /dev/stderr | \
xargs rm
echo Removed duplicate files successfully.
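The final pipeline relies on comm -3, which suppresses the lines common to both sorted input files and prints whatever remains; because unique_files is a subset of duplicate_files, only the redundant copies survive. Here is a minimal illustration of that set-difference behavior (the file names all and keep are used only for this demo):
$ printf 'a\nb\nc\n' > all
$ printf 'a\n' > keep
$ comm -3 all keep
b
c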
- Run the code as follows:
$ ./remove_duplicates.sh
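With the sample files generated earlier, test sorts first within its checksum group and is kept, so the run should print something like this (the two removed copies are echoed to stderr by tee):
Removing...
test_copy1
test_copy2
Removed duplicate files successfully.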