#!/usr/bin/perl
# File  : checkdups.pl
# Author: Lyndon Hill, http://www.lyndonhill.com
# Note  : This script is free to use. If you make significant improvements please
#         send me a copy.
#
# Script to find duplicate entries
# Usage:
#  checkdups.pl > duplicate-list
#
# Input format as this script reads it: a blank line ends the current entry,
# and the line immediately after a blank line is the headword of the next
# entry.  Everything before the first blank line is never treated as an entry.

use strict;
use warnings;

# User editable variables

# input dictionary file
my $dicfile = "en-ka.bedic";

# duplicates are case insensitive=1, case sensitive=0. (I don't use this)
my $english = 0;

# End of user editable variables

# Run only when executed as a script, so the subs below can also be loaded
# (e.g. by a test) without touching the dictionary file.
exit main($dicfile, $english) unless caller;

# main($path, $fold_case)
#   Opens the dictionary at $path, prints one report line per duplicate entry
#   to STDOUT and returns the process exit status (0).
#   Dies with a message on STDERR if the file cannot be opened.
sub main {
    my ($path, $fold_case) = @_;

    # Three-argument open with a lexical handle; a missing dictionary is now
    # reported instead of silently producing an empty duplicate list.
    open(my $fh, '<', $path)
        or die "checkdups.pl: cannot open '$path': $!\n";

    my @report = find_duplicates($fh, $fold_case);
    close($fh);

    print @report;

    # [debug only]
    # print STDERR scalar(@report) . " duplicates.\n";

    return 0;
}

# find_duplicates($fh, $fold_case)
#   Reads dictionary lines from the open filehandle $fh and returns a list of
#   report lines (each newline-terminated), one per entry whose headword was
#   already seen earlier in the input, in file order.
#   $fold_case true  => headwords are lower-cased before comparison;
#   $fold_case false => comparison is case sensitive.
sub find_duplicates {
    my ($fh, $fold_case) = @_;

    my @entries;              # one record per entry, in file order
    my %first_seen;           # headword -> index of its first occurrence
    my $lineno       = 0;     # 1-based number of the line just read
    my $expect_entry = 0;     # true when the next line is a headword
    my $last_line;            # text of the final line read, undef if none

    while (defined(my $line = <$fh>)) {
        # chomp, not chop: a final line without a newline must keep its
        # last character, otherwise a trailing headword is corrupted.
        chomp $line;
        $lineno++;
        $last_line = $line;

        if ($expect_entry) {
            # read the entry, record its position
            my $entry    = $fold_case ? lc($line) : $line;
            my $original = $first_seen{$entry};   # undef => first occurrence
            $first_seen{$entry} = scalar(@entries) unless defined $original;

            push @entries, {
                text   => $entry,
                first  => $lineno,
                last   => $lineno,   # provisional; extended when entry ends
                dup_of => $original,
            };
            $expect_entry = 0;
        }
        elsif ($line eq "") {
            # next line should be an entry; end the previous entry on the
            # line before this blank one
            $expect_entry = 1;
            $entries[-1]{last} = $lineno - 1 if @entries;
        }
    }

    # deal with last entry: input that does not end in a blank line leaves
    # the final entry open, so close it on the last line read
    if (defined $last_line && $last_line ne "" && @entries) {
        $entries[-1]{last} = $lineno;
    }

    # output duplicate list (only real entries are visited)
    my @report;
    for my $entry (@entries) {
        next unless defined $entry->{dup_of};
        my $orig = $entries[ $entry->{dup_of} ];
        push @report,
              "Lines " . $entry->{first} . " to " . $entry->{last}
            . " duplicated from " . $orig->{first} . " to " . $orig->{last}
            . " ($entry->{text})\n";
    }
    return @report;
}

1;