利用dejavu声音指纹清理重复音乐文件#
项目地址#
使用方法#
from dejavu import Dejavu

# Connection settings for the MySQL database that holds the fingerprint tables.
mysql_settings = {
    "host": "127.0.0.1",
    "user": "root",
    "password": "Password123",
    "database": "dejavu_db",
}

config = {
    "database": mysql_settings,
    "database_type": "mysql",
    "fingerprint_limit": 10,
}

djv = Dejavu(config)

# Fingerprint every .mp3/.m4a/.wav file found under music/path.
# NOTE(review): the trailing 3 is presumably the number of worker
# processes -- confirm against Dejavu.fingerprint_directory.
audio_extensions = [".mp3", ".m4a", ".wav"]
djv.fingerprint_directory("music/path", audio_extensions, 3)
音乐识别#
from dejavu.logic.recognizer.file_recognizer import FileRecognizer

# Look one file up in the fingerprint database through the djv instance
# created in the setup snippet.
song = djv.recognize(FileRecognizer, "music/path/YQJO.mp3")
res = song['results']
if res:
    # Report the first entry of the result list.
    first = res[0]
    song_id = first['song_id']
    song_name = first['song_name']
    # The name is decoded as UTF-8 bytes here; tolerate an already-decoded str.
    if isinstance(song_name, bytes):
        song_name = song_name.decode('utf-8')
    print(f'''{song_id} -- {song_name}''')
else:
    # An empty result list would otherwise raise IndexError on res[0].
    print("no match found")
数据结构#
系统会创建 2 张表:`songs` 和 `fingerprints`。系统扫描到重复音乐时,会打印重复信息,但并不会把它记录到数据库表中,而且 `songs` 表也没有文件位置信息,不便于筛选重复音乐文件。所以,增加了 `song_file_info` 表,以记录文件信息。
增加音乐信息#
diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index fac72bc..689c541 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -84,8 +84,12 @@ class Dejavu:
filenames_to_fingerprint = []
for filename, _ in decoder.find_files(path, extensions):
# don't refingerprint already fingerprinted files
- if decoder.unique_hash(filename) in self.songhashes_set:
+ file_hash=decoder.unique_hash(filename)
+ if file_hash in self.songhashes_set:
print(f"{filename} already fingerprinted, continuing...")
+ file_size = os.path.getsize(filename)
+ song_name, extension = os.path.splitext(os.path.basename(filename))
+ self.db.insert_song_file_info(song_name, file_hash,total_hashes=0,song_path=filename,song_size=file_size)
continue
filenames_to_fingerprint.append(filename)
@@ -99,7 +103,7 @@ class Dejavu:
# Loop till we have all of them
while True:
try:
- song_name, hashes, file_hash = next(iterator)
+ song_name, hashes, file_hash,extension,filename,file_size = next(iterator)
except multiprocessing.TimeoutError:
continue
except StopIteration:
@@ -111,6 +115,8 @@ class Dejavu:
else:
sid = self.db.insert_song(song_name, file_hash, len(hashes))
+ self.db.insert_song_file_info(song_name, file_hash,total_hashes=len(hashes),song_path=filename,song_size=file_size)
+
self.db.insert_hashes(sid, hashes)
self.db.set_song_fingerprinted(sid)
self.__load_fingerprinted_audio_hashes()
@@ -132,14 +138,18 @@ class Dejavu:
# don't refingerprint already fingerprinted files
if song_hash in self.songhashes_set:
print(f"{song_name} already fingerprinted, continuing...")
+ file_size = os.path.getsize(file_path)
+ song_name, extension = os.path.splitext(os.path.basename(file_path))
+ self.db.insert_song_file_info(song_name, song_hash,total_hashes=0,song_path=file_path,song_size=file_size)  # song_hash: file_hash is only bound in the else branch
else:
- song_name, hashes, file_hash = Dejavu._fingerprint_worker(
+ song_name, hashes, file_hash, _extension, _file_name, file_size = Dejavu._fingerprint_worker(  # the worker now returns six values
file_path,
self.limit,
song_name=song_name
)
sid = self.db.insert_song(song_name, file_hash)
+ self.db.insert_song_file_info(song_name, file_hash,total_hashes=len(hashes),song_path=file_path,song_size=file_size)
self.db.insert_hashes(sid, hashes)
self.db.set_song_fingerprinted(sid)
self.__load_fingerprinted_audio_hashes()
@@ -238,7 +248,9 @@ class Dejavu:
fingerprints, file_hash = Dejavu.get_file_fingerprints(file_name, limit, print_output=True)
- return song_name, fingerprints, file_hash
+ file_size = os.path.getsize(file_name)
+
+ return song_name, fingerprints, file_hash,extension,file_name,file_size
@staticmethod
def get_file_fingerprints(file_name: str, limit: int, print_output: bool = False):
diff --git a/dejavu/base_classes/base_database.py b/dejavu/base_classes/base_database.py
index 839a72a..bbd2378 100755
--- a/dejavu/base_classes/base_database.py
+++ b/dejavu/base_classes/base_database.py
@@ -118,6 +118,19 @@ class BaseDatabase(object, metaclass=abc.ABCMeta):
"""
pass
+ @abc.abstractmethod
+ def insert_song_file_info(self, song_name: str, file_hash: str, total_hashes: int, song_path: str, song_size: int) -> int:
+ """
+ Inserts a file-info row (name, hash, path, size) for an audio file, returns the new row id.
+ :param song_name: The name of the song.
+ :param file_hash: Hash from the fingerprinted file.
+ :param total_hashes: amount of hashes of the file; callers pass 0 for a file that was already fingerprinted.
+ :param song_path: path of the audio file.
+ :param song_size: size of the audio file in bytes.
+ :return: the inserted id.
+ """
+ pass
+
@abc.abstractmethod
def query(self, fingerprint: str = None) -> List[Tuple]:
"""
diff --git a/dejavu/base_classes/common_database.py b/dejavu/base_classes/common_database.py
index e884285..8dcdf95 100644
--- a/dejavu/base_classes/common_database.py
+++ b/dejavu/base_classes/common_database.py
@@ -34,6 +34,7 @@ class CommonDatabase(BaseDatabase, metaclass=abc.ABCMeta):
cur.execute(self.CREATE_SONGS_TABLE)
cur.execute(self.CREATE_FINGERPRINTS_TABLE)
cur.execute(self.DELETE_UNFINGERPRINTED)
+ cur.execute(self.CREATE_SONGS_FILE_INFO_TABLE)
def empty(self) -> None:
"""
@@ -124,6 +125,18 @@ class CommonDatabase(BaseDatabase, metaclass=abc.ABCMeta):
Inserts a song name into the database, returns the new
identifier of the song.
+ :param song_name: The name of the song.
+ :param file_hash: Hash from the fingerprinted file.
+ :param total_hashes: amount of hashes to be inserted on fingerprint table.
+ :return: the inserted id.
+ """
+ pass
+ @abc.abstractmethod
+ def insert_song_file_info(self, song_name: str, file_hash: str, total_hashes: int, song_path: str, song_size: int) -> int:
+ """
+ Inserts a file-info row (name, hash, path, size) for an audio file,
+ returns the new row id.
+
:param song_name: The name of the song.
:param file_hash: Hash from the fingerprinted file.
:param total_hashes: amount of hashes to be inserted on fingerprint table.
diff --git a/dejavu/config/settings.py b/dejavu/config/settings.py
index 0e20569..4f986ab 100644
--- a/dejavu/config/settings.py
+++ b/dejavu/config/settings.py
@@ -40,6 +40,15 @@ FIELD_FINGERPRINTED = "fingerprinted"
FIELD_FILE_SHA1 = 'file_sha1'
FIELD_TOTAL_HASHES = 'total_hashes'
+
+# TABLE SONG_FILE_INFO
+SONG_FILE_INFO_TABLENAME = "song_file_info"
+# SONG_FILE_INFO FIELDS
+FIELD_FILE_SHA1 = 'file_sha1'
+FIELD_TOTAL_HASHES = 'total_hashes'
+FIELD_FILE_PATH = 'file_path'
+FIELD_FILE_SIZE = 'file_size'
+
# TABLE FINGERPRINTS
FINGERPRINTS_TABLENAME = "fingerprints"
diff --git a/dejavu/database_handler/mysql_database.py b/dejavu/database_handler/mysql_database.py
index 1a8c506..72df2e6 100755
--- a/dejavu/database_handler/mysql_database.py
+++ b/dejavu/database_handler/mysql_database.py
@@ -7,7 +7,7 @@ from dejavu.base_classes.common_database import CommonDatabase
from dejavu.config.settings import (FIELD_FILE_SHA1, FIELD_FINGERPRINTED,
FIELD_HASH, FIELD_OFFSET, FIELD_SONG_ID,
FIELD_SONGNAME, FIELD_TOTAL_HASHES,
- FINGERPRINTS_TABLENAME, SONGS_TABLENAME)
+ FINGERPRINTS_TABLENAME, SONGS_TABLENAME,SONG_FILE_INFO_TABLENAME,FIELD_FILE_PATH,FIELD_FILE_SIZE)
class MySQLDatabase(CommonDatabase):
@@ -43,6 +43,21 @@ class MySQLDatabase(CommonDatabase):
) ENGINE=INNODB;
"""
+ CREATE_SONGS_FILE_INFO_TABLE = f"""
+ CREATE TABLE IF NOT EXISTS `{SONG_FILE_INFO_TABLENAME}` (
+ `{FIELD_SONG_ID}` MEDIUMINT UNSIGNED NOT NULL AUTO_INCREMENT
+ , `{FIELD_SONGNAME}` VARCHAR(250) NOT NULL
+ , `{FIELD_FILE_SHA1}` BINARY(20) NOT NULL
+ , `{FIELD_TOTAL_HASHES}` INT NOT NULL DEFAULT 0
+ , `{FIELD_FILE_PATH}` VARCHAR(250) NOT NULL
+ , `{FIELD_FILE_SIZE}` INT NULL
+ , `date_created` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
+ , `date_modified` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
+ , CONSTRAINT `pk_{SONG_FILE_INFO_TABLENAME}_{FIELD_SONG_ID}` PRIMARY KEY (`{FIELD_SONG_ID}`)
+ , CONSTRAINT `uq_{SONG_FILE_INFO_TABLENAME}_{FIELD_SONG_ID}` UNIQUE KEY (`{FIELD_SONG_ID}`)
+ ) ENGINE=INNODB;
+ """
+
# INSERTS (IGNORES DUPLICATES)
INSERT_FINGERPRINT = f"""
INSERT IGNORE INTO `{FINGERPRINTS_TABLENAME}` (
@@ -57,6 +72,13 @@ class MySQLDatabase(CommonDatabase):
VALUES (%s, UNHEX(%s), %s);
"""
+ INSERT_SONG_FILE_INFO = f"""
+ INSERT INTO `{SONG_FILE_INFO_TABLENAME}` (`{FIELD_SONGNAME}`,`{FIELD_FILE_SHA1}`,`{FIELD_TOTAL_HASHES}`,
+ `{FIELD_FILE_PATH}`,`{FIELD_FILE_SIZE}`)
+ VALUES (%s, UNHEX(%s), %s,%s,%s);
+ """
+
+
# SELECTS
SELECT = f"""
SELECT `{FIELD_SONG_ID}`, `{FIELD_OFFSET}`
@@ -142,6 +164,12 @@ class MySQLDatabase(CommonDatabase):
cur.execute(self.INSERT_SONG, (song_name, file_hash, total_hashes))
return cur.lastrowid
+
+ def insert_song_file_info(self, song_name: str, file_hash: str, total_hashes: int,song_path:str,song_size:int) -> int:
+ with self.cursor() as cur:
+ cur.execute(self.INSERT_SONG_FILE_INFO, (song_name, file_hash, total_hashes,song_path,song_size))
+ return cur.lastrowid
+
def __getstate__(self):
return self._options,
筛选重复音乐语句#
-- List every file whose SHA-1 is recorded for more than one file in
-- song_file_info, i.e. the duplicate music files.
-- The schema name matches the "database" entry of the Dejavu config (dejavu_db).
select s.song_id, s.song_name, f.song_name, f.file_path, f.file_size, s.total_hashes
from dejavu_db.songs s
left join dejavu_db.song_file_info f on s.file_sha1 = f.file_sha1
where s.song_id in
(
    -- songs that join to more than one song_file_info row
    select t.song_id
    from (
        select s.song_id
        from dejavu_db.songs s
        left join dejavu_db.song_file_info f on s.file_sha1 = f.file_sha1
        group by s.song_id
        having count(*) > 1
    ) t
)
order by s.file_sha1;
修改文件名或者标题#
检查是否乱码#
is_garbled.py
# -*- coding: utf-8 -*-
"""Command-line check for garbled (mojibake) text.

Usage: python3 is_garbled.py --text "some title"
Prints ``True`` when the text looks garbled, ``False`` otherwise.
"""
import re
import argparse

parser = argparse.ArgumentParser(description='manual to this script')
parser.add_argument("--text", type=str, default='')

# Any run of characters other than printable ASCII (U+0021-U+007E), the
# space character, or U+4E00-U+9FFF (CJK Unified Ideographs) counts as garbled.
_GARBLED_PATTERN = re.compile(r'[^\u0021-\u007E \u4E00-\u9FFF]+')


def is_garbled(text):
    """Return True if *text* contains a character outside the accepted ranges.

    :param text: the string to inspect; an empty string is not garbled.
    :return: True when at least one unexpected character is found.
    """
    return _GARBLED_PATTERN.search(text) is not None


def main(argv=None):
    """Parse ``--text``, print the verdict and return it.

    :param argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    :return: the boolean result of :func:`is_garbled`.
    """
    # Parse here rather than at import time, so importing this module never
    # consumes (or rejects) the command line of the importing program.
    args = parser.parse_args(argv)
    result = is_garbled(args.text)
    print(result)
    return result


if __name__ == '__main__':
    main()
修改文件标题或者文件名#
#!/bin/bash
# Fix audio titles and file names for the regular files directly inside <path>.
# Results are written to "<path>/new":
#   * title is garbled (is_garbled.py prints True): copy the file with ffmpeg,
#     replacing the title tag by the file name without its extension;
#   * title is readable and the file name is exactly four capital letters:
#     move the file to "<title><extension>";
#   * no title tag: only report the file.

usage()
{
cat<<HELP
-p path -py is_garbled.py path
-h show this help
HELP
exit 0
}

# Abort when an option that needs a value is the last argument; a bare
# "shift 2" would fail there and the parsing loop would never advance.
# $1 = number of remaining arguments, $2 = option name.
require_value()
{
    [ "$1" -ge 2 ] || { echo "Error: $2 needs a value. -h for help." >&2; exit 1; }
}

[ $# -eq 0 ] && { echo "-h for help"; exit 1; }

# Start from empty values so that variables inherited from the environment
# cannot stand in for a missing option.
path=""
py_path=""
while [ $# -gt 0 ]; do
    case "$1" in
        -h) usage ;;
        -p) require_value $# "$1"; path=$2; shift 2 ;;
        -py) require_value $# "$1"; py_path=$2; shift 2 ;;
        # Exit instead of looping: nothing is shifted for an unknown option.
        -*) echo "Error: NO Such Option. -h for help." >&2; exit 1 ;;
        *) break ;;
    esac
done

# Both options are mandatory: the directory to scan and the directory that
# holds is_garbled.py.
if [ -z "$path" ] || [ -z "$py_path" ]; then
    echo "-h for help"
    exit 1
fi
[ -d "$path" ] || { echo "Error: $path is not a directory." >&2; exit 1; }
[ -f "$py_path/is_garbled.py" ] || { echo "Error: $py_path/is_garbled.py not found." >&2; exit 1; }

# ffmpeg and mv both write into this directory.
mkdir -p "$path/new" || exit 1

for src in "$path"/*; do
    # Skips sub-directories (including "new") and the literal pattern that
    # remains when the directory is empty.
    [ -f "$src" ] || continue
    file=${src##*/}

    # Ask ffprobe for the bare value of the container-level "title" tag, so
    # colons, commas and quotes inside the title survive untouched.
    title=$(ffprobe -v quiet -show_entries format_tags=title \
        -of default=noprint_wrappers=1:nokey=1 "$src" | head -n 1)
    # Strip leading whitespace.
    title="${title#"${title%%[![:space:]]*}"}"

    if [ -n "$title" ]; then
        # "--text=" keeps a title that starts with "-" from being read as an option.
        is_garbled=$(python3 "$py_path/is_garbled.py" --text="$title")
    else
        is_garbled="Empty"
    fi

    case "$is_garbled" in
        True)
            echo "garbled: $file"
            # Use the file name without the extension as the new title.
            new_title="${file%.*}"
            echo "new_title: $new_title"
            ffmpeg -i "$src" -c copy -metadata title="$new_title" "$path/new/$file"
            ;;
        False)
            echo "not garbled: $file"
            file_name=${file%.*}
            ext_name=${file:${#file_name}}
            # Only files named with exactly four capital letters are renamed.
            if [[ $file_name =~ ^[A-Z]{4}$ ]]; then
                # A "/" in the title would be taken as a directory separator.
                new_file_name="${title//\//_}${ext_name}"
                target="$path/new/$new_file_name"
                if [ -e "$target" ]; then
                    # Two files can carry the same title; never overwrite one.
                    echo "skip, target exists: $target" >&2
                else
                    mv "$src" "$target"
                    echo "rename file_name: $file_name to new_file_name: $new_file_name"
                fi
            fi
            ;;
        Empty)
            echo "--------Empty Title: $path/$file ---------"
            ;;
        *)
            # is_garbled.py printed neither True nor False (e.g. it failed).
            echo "skip, unexpected is_garbled.py output for: $src" >&2
            ;;
    esac
done