各位用户为了找寻关于 Python 查找指定目录中内容相同的文件的方法的资料费了很多周折。这里教程网为您整理了相关资料,仅供查阅,以下为您介绍该方法的详细内容。
本文实例讲述了用 Python 查找指定目录中内容相同的文件的方法。分享给大家供大家参考。具体如下:
这段 Python 代码用于在指定目录中查找内容相同的文件,可以同时指定多个目录(以分号分隔)。调用方式:python doublesdetector.py c:\;d:\;e:\ > doubles.txt
# Hello, this script is written in Python - http://www.python.org
# doublesdetector.py 1.0p
import hashlib
import os
import os.path
import string
import sys

try:
    import sha  # legacy Python 2 module, superseded by hashlib
except ImportError:
    sha = None  # the module was removed in Python 3
# Usage text printed when the script is started without arguments.
# sys.argv[0] is substituted into each of the four %s placeholders so the
# syntax line and the examples show the name the script was invoked with.
# Backslashes are written as \\ so the Windows example paths survive as
# literal text instead of being read as (invalid) escape sequences.
message = """
doublesdetector.py 1.0p
This script will search for files that are identical
(whatever their name/date/time).
Syntax : python %s <directories>
where <directories> is a directory or a list of directories
separated by a semicolon (;)
Examples : python %s c:\\windows
python %s c:\\;d:\\;e:\\ > doubles.txt
python %s c:\\program files > doubles.txt
This script is public domain. Feel free to reuse and tweak it.
The author of this script Sebastien SAUVAGE <sebsauvage at sebsauvage dot net>
http://sebsauvage.net/python/
""" % ((sys.argv[0],) * 4)
def fileSHA(filepath):
    """Compute the SHA-1 (Secure Hash Algorithm) digest of a file.

    The file is read in 64 KiB chunks, so its content is never held in
    memory all at once.

    Input  : filepath : full path and name of file
                        (eg. 'c:\\windows\\emm386.exe')
    Output : string : the hexadecimal representation of the SHA-1 of
                      the file.
             Returns '0' if the file could not be read (file not found,
             no read rights...).
    """
    digest = hashlib.sha1()
    try:
        # The with-block closes the handle even when a read fails midway.
        with open(filepath, 'rb') as handle:
            # iter() with a sentinel stops on the empty read at end of file.
            for chunk in iter(lambda: handle.read(65536), b''):
                digest.update(chunk)
    except (IOError, OSError):
        # '0' is the "unreadable" marker callers filter out; it can never
        # collide with a real 40-character hex digest.
        return '0'
    return digest.hexdigest()
def detectDoubles(directories):
    """Find groups of files that have identical content.

    Files are first grouped by size; only files sharing a size are then
    hashed with fileSHA() and grouped by digest.

    Input  : directories : a directory, or a list of directories
                           separated by a semicolon (;)
    Output : dictionary : {SHA hex digest: [paths of the identical files]}.
             Only digests shared by at least two files are present; files
             that could not be read are left out.

    Progress messages are written to stderr, so stdout stays free for the
    caller's own output.
    """
    fileslist = {}
    # Group all files by size (in the fileslist dictionary)
    for directory in directories.split(';'):
        if not directory:
            # An empty entry (e.g. a trailing ';') would make abspath('')
            # resolve to the current directory and scan it unasked.
            continue
        directory = os.path.abspath(directory)
        sys.stderr.write('Scanning directory ' + directory + '...')
        # os.walk is used because os.path.walk does not exist in Python 3.
        for dirpath, _dirnames, filenames in os.walk(directory):
            callback(fileslist, dirpath, filenames)
        sys.stderr.write('\n')

    sys.stderr.write('Comparing files...')
    # Now compute SHA of files that have the same size,
    # and group files by SHA (in the filessha dictionary)
    filessha = {}
    for listoffiles in fileslist.values():
        if len(listoffiles) < 2:
            # A file with a unique size cannot have a double: skip hashing.
            continue
        for filepath in listoffiles:
            sys.stderr.write('.')
            filessha.setdefault(fileSHA(filepath), []).append(filepath)

    # '0' is what fileSHA() returns for files it could not read.
    filessha.pop('0', None)

    # Keep only the digests shared by several files. A new dictionary is
    # built because deleting keys while iterating fails on Python 3.
    filessha = {
        digest: paths for digest, paths in filessha.items() if len(paths) > 1
    }
    sys.stderr.write('\n')
    return filessha
def callback(fileslist, directory, files):
    """Record the size of every regular file of one directory.

    Input  : fileslist : dictionary {size in bytes: [file paths]};
                         it is updated in place
             directory : path of the directory being visited
             files     : names of the entries found in that directory
    Output : None. Entries that are not regular files are ignored.
    """
    sys.stderr.write('.')  # progress indicator: one dot per directory
    for fileName in files:
        filepath = os.path.join(directory, fileName)
        if not os.path.isfile(filepath):
            continue
        try:
            filesize = os.path.getsize(filepath)
        except OSError:
            # The file vanished or became inaccessible after the isfile()
            # check; skip it instead of aborting the whole scan.
            continue
        fileslist.setdefault(filesize, []).append(filepath)
def formatReport(doubles):
    """Build the text report listing each group of identical files.

    Input  : doubles : dictionary {digest: [file paths]} as returned by
                       detectDoubles()
    Output : string : a header line, then every group introduced by a
                      '----' line with one path per line, then a closing
                      '----' line.
    """
    groups = ['----\n%s' % '\n'.join(paths) for paths in doubles.values()]
    return '\n'.join([
        'The following files are identical:',
        '\n'.join(groups),
        '----',
    ])


def main():
    """Run the duplicate search on the command-line arguments.

    Without arguments the usage message is printed instead.
    """
    if len(sys.argv) > 1:
        # The arguments are re-joined with spaces so that an unquoted path
        # containing spaces (c:\program files) is seen as a single path.
        doubles = detectDoubles(" ".join(sys.argv[1:]))
        # print() with a single argument behaves the same on Python 2 and 3.
        print(formatReport(doubles))
    else:
        print(message)


# Only run when executed as a script, so that importing this module to
# reuse its functions triggers neither a scan nor any output.
if __name__ == '__main__':
    main()
希望本文所述对大家的Python程序设计有所帮助。