利用dejavu声音指纹清理重复音乐文件#

项目地址#

原项目地址 项目地址

使用方法#

from dejavu import Dejavu
# Settings handed to Dejavu: MySQL connection details plus fingerprint options.
# NOTE(review): the credentials below are sample values - replace them with
# your own and keep real passwords out of version control.
# NOTE(review): the meaning/unit of "fingerprint_limit" is not shown here -
# confirm against the dejavu documentation.
config = {
    "database": {
        "host": "127.0.0.1",
        "user": "root",
        "password": "Password123",
        "database": "dejavu_db",
    },
    "database_type": "mysql",
    "fingerprint_limit": 10,
}

djv = Dejavu(config)

# Fingerprint the files under "music/path" that carry one of these extensions.
# NOTE(review): the trailing 3 is presumably the number of worker processes -
# confirm against the dejavu documentation.
audio_extensions = [".mp3", ".m4a", ".wav"]
djv.fingerprint_directory("music/path", audio_extensions, 3)

音乐识别#

from dejavu.logic.recognizer.file_recognizer import FileRecognizer
# Recognise one audio file against the fingerprints stored in the database.
song = djv.recognize(FileRecognizer, "music/path/YQJO.mp3")
res = song['results']

if res:
    # Only the first entry is reported.
    # NOTE(review): presumably the results are ordered best match first - confirm.
    first_match = res[0]
    song_id = first_match['song_id']
    # The name is decoded from bytes before printing.
    song_name = first_match['song_name'].decode('utf-8')
    print(f'''{song_id} -- {song_name}''')
else:
    # An empty result list used to crash with IndexError on res[0].
    song_id = None
    song_name = None
    print("no match found")

数据结构#

系统会创建 2 张表：songs 和 fingerprints。系统扫描到重复音乐时，会打印重复信息，但是并没有记录到数据库表中，而 songs 表也没有文件位置信息，不便于筛选重复音乐文件。所以，增加了 song_file_info 表，以记录文件信息。

增加音乐信息#

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
index fac72bc..689c541 100755
--- a/dejavu/__init__.py
+++ b/dejavu/__init__.py
@@ -84,8 +84,12 @@ class Dejavu:
         filenames_to_fingerprint = []
         for filename, _ in decoder.find_files(path, extensions):
             # don't refingerprint already fingerprinted files
-            if decoder.unique_hash(filename) in self.songhashes_set:
+            file_hash=decoder.unique_hash(filename)
+            if file_hash in self.songhashes_set:
                 print(f"{filename} already fingerprinted, continuing...")
+                file_size = os.path.getsize(filename)
+                song_name, extension = os.path.splitext(os.path.basename(filename))
+                self.db.insert_song_file_info(song_name, file_hash,total_hashes=0,song_path=filename,song_size=file_size)
                 continue
 
             filenames_to_fingerprint.append(filename)
@@ -99,7 +103,7 @@ class Dejavu:
         # Loop till we have all of them
         while True:
             try:
-                song_name, hashes, file_hash = next(iterator)
+                song_name, hashes, file_hash,extension,filename,file_size = next(iterator)
             except multiprocessing.TimeoutError:
                 continue
             except StopIteration:
@@ -111,6 +115,8 @@ class Dejavu:
             else:
                 sid = self.db.insert_song(song_name, file_hash, len(hashes))
 
+                self.db.insert_song_file_info(song_name, file_hash,total_hashes=len(hashes),song_path=filename,song_size=file_size)
+
                 self.db.insert_hashes(sid, hashes)
                 self.db.set_song_fingerprinted(sid)
                 self.__load_fingerprinted_audio_hashes()
@@ -132,14 +138,18 @@ class Dejavu:
         # don't refingerprint already fingerprinted files
         if song_hash in self.songhashes_set:
             print(f"{song_name} already fingerprinted, continuing...")
+            file_size = os.path.getsize(file_path)
+            song_name, extension = os.path.splitext(os.path.basename(file_path))
+            self.db.insert_song_file_info(song_name, song_hash,total_hashes=0,song_path=file_path,song_size=file_size)  # file_hash is unbound on this branch; song_hash is the hash that was just tested
         else:
-            song_name, hashes, file_hash = Dejavu._fingerprint_worker(
+            song_name, hashes, file_hash, _, _, file_size = Dejavu._fingerprint_worker(  # the patched worker returns a 6-tuple; extension and file name are not needed here
                 file_path,
                 self.limit,
                 song_name=song_name
             )
             sid = self.db.insert_song(song_name, file_hash)
 
+            self.db.insert_song_file_info(song_name, file_hash,total_hashes=len(hashes),song_path=file_path,song_size=file_size)
             self.db.insert_hashes(sid, hashes)
             self.db.set_song_fingerprinted(sid)
             self.__load_fingerprinted_audio_hashes()
@@ -238,7 +248,9 @@ class Dejavu:
 
         fingerprints, file_hash = Dejavu.get_file_fingerprints(file_name, limit, print_output=True)
 
-        return song_name, fingerprints, file_hash
+        file_size = os.path.getsize(file_name)
+
+        return song_name, fingerprints, file_hash,extension,file_name,file_size
 
     @staticmethod
     def get_file_fingerprints(file_name: str, limit: int, print_output: bool = False):
diff --git a/dejavu/base_classes/base_database.py b/dejavu/base_classes/base_database.py
index 839a72a..bbd2378 100755
--- a/dejavu/base_classes/base_database.py
+++ b/dejavu/base_classes/base_database.py
@@ -118,6 +118,22 @@ class BaseDatabase(object, metaclass=abc.ABCMeta):
         """
         pass
 
+    @abc.abstractmethod
+    def insert_song_file_info(self, song_name: str, file_hash: str, total_hashes: int, song_path: str, song_size: int) -> int:
+        """
+        Records the on-disk location and size of one scanned audio file,
+        returns the identifier of the inserted row.
+
+        :param song_name: The name of the song.
+        :param file_hash: Hash from the fingerprinted file.
+        :param total_hashes: amount of hashes inserted on fingerprint table
+            (callers pass 0 for a file whose hash was already fingerprinted).
+        :param song_path: path of the audio file.
+        :param song_size: size of the audio file in bytes.
+        :return: the inserted id.
+        """
+        pass
+
     @abc.abstractmethod
     def query(self, fingerprint: str = None) -> List[Tuple]:
         """
diff --git a/dejavu/base_classes/common_database.py b/dejavu/base_classes/common_database.py
index e884285..8dcdf95 100644
--- a/dejavu/base_classes/common_database.py
+++ b/dejavu/base_classes/common_database.py
@@ -34,6 +34,7 @@ class CommonDatabase(BaseDatabase, metaclass=abc.ABCMeta):
             cur.execute(self.CREATE_SONGS_TABLE)
             cur.execute(self.CREATE_FINGERPRINTS_TABLE)
             cur.execute(self.DELETE_UNFINGERPRINTED)
+            cur.execute(self.CREATE_SONGS_FILE_INFO_TABLE)
 
     def empty(self) -> None:
         """
@@ -124,6 +125,19 @@ class CommonDatabase(BaseDatabase, metaclass=abc.ABCMeta):
         Inserts a song name into the database, returns the new
         identifier of the song.
 
+        :param song_name: The name of the song.
+        :param file_hash: Hash from the fingerprinted file.
+        :param total_hashes: amount of hashes to be inserted on fingerprint table.
+        :return: the inserted id.
+        """
+        pass
+
+    @abc.abstractmethod
+    def insert_song_file_info(self, song_name: str, file_hash: str, total_hashes: int, song_path: str, song_size: int) -> int:
+        """
+        Records the path and size of one scanned audio file, returns the
+        identifier of the inserted row.
+
         :param song_name: The name of the song.
         :param file_hash: Hash from the fingerprinted file.
         :param total_hashes: amount of hashes to be inserted on fingerprint table.
diff --git a/dejavu/config/settings.py b/dejavu/config/settings.py
index 0e20569..4f986ab 100644
--- a/dejavu/config/settings.py
+++ b/dejavu/config/settings.py
@@ -40,6 +40,13 @@ FIELD_FINGERPRINTED = "fingerprinted"
 FIELD_FILE_SHA1 = 'file_sha1'
 FIELD_TOTAL_HASHES = 'total_hashes'
 
+
+# TABLE SONG_FILE_INFO
+SONG_FILE_INFO_TABLENAME = "song_file_info"
+# SONG_FILE_INFO FIELDS (file_sha1 / total_hashes reuse the constants defined just above)
+FIELD_FILE_PATH = 'file_path'
+FIELD_FILE_SIZE = 'file_size'
+
 # TABLE FINGERPRINTS
 FINGERPRINTS_TABLENAME = "fingerprints"
 
diff --git a/dejavu/database_handler/mysql_database.py b/dejavu/database_handler/mysql_database.py
index 1a8c506..72df2e6 100755
--- a/dejavu/database_handler/mysql_database.py
+++ b/dejavu/database_handler/mysql_database.py
@@ -7,7 +7,7 @@ from dejavu.base_classes.common_database import CommonDatabase
 from dejavu.config.settings import (FIELD_FILE_SHA1, FIELD_FINGERPRINTED,
                                     FIELD_HASH, FIELD_OFFSET, FIELD_SONG_ID,
                                     FIELD_SONGNAME, FIELD_TOTAL_HASHES,
-                                    FINGERPRINTS_TABLENAME, SONGS_TABLENAME)
+                                    FINGERPRINTS_TABLENAME, SONGS_TABLENAME,SONG_FILE_INFO_TABLENAME,FIELD_FILE_PATH,FIELD_FILE_SIZE)
 
 
 class MySQLDatabase(CommonDatabase):
@@ -43,6 +43,21 @@ class MySQLDatabase(CommonDatabase):
     ) ENGINE=INNODB;
     """
 
+    CREATE_SONGS_FILE_INFO_TABLE = f"""
+        CREATE TABLE IF NOT EXISTS `{SONG_FILE_INFO_TABLENAME}` (
+            `{FIELD_SONG_ID}` MEDIUMINT UNSIGNED NOT NULL AUTO_INCREMENT
+        ,   `{FIELD_SONGNAME}` VARCHAR(250) NOT NULL
+        ,   `{FIELD_FILE_SHA1}` BINARY(20) NOT NULL
+        ,   `{FIELD_TOTAL_HASHES}` INT NOT NULL DEFAULT 0
+        ,   `{FIELD_FILE_PATH}` VARCHAR(250) NOT NULL
+        ,   `{FIELD_FILE_SIZE}` BIGINT UNSIGNED NULL
+        ,   `date_created` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
+        ,   `date_modified` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
+        ,   CONSTRAINT `pk_{SONG_FILE_INFO_TABLENAME}_{FIELD_SONG_ID}` PRIMARY KEY (`{FIELD_SONG_ID}`)
+        ,   INDEX `ix_{SONG_FILE_INFO_TABLENAME}_{FIELD_FILE_SHA1}` (`{FIELD_FILE_SHA1}`)
+        ) ENGINE=INNODB;
+    """  # the key column is an AUTO_INCREMENT row id because INSERT_SONG_FILE_INFO never supplies it; file_sha1 is indexed for the duplicate lookup
+
     # INSERTS (IGNORES DUPLICATES)
     INSERT_FINGERPRINT = f"""
         INSERT IGNORE INTO `{FINGERPRINTS_TABLENAME}` (
@@ -57,6 +72,13 @@ class MySQLDatabase(CommonDatabase):
         VALUES (%s, UNHEX(%s), %s);
     """
 
+    INSERT_SONG_FILE_INFO = f"""
+        INSERT INTO `{SONG_FILE_INFO_TABLENAME}` (`{FIELD_SONGNAME}`,`{FIELD_FILE_SHA1}`,`{FIELD_TOTAL_HASHES}`,
+         `{FIELD_FILE_PATH}`,`{FIELD_FILE_SIZE}`)
+        VALUES (%s, UNHEX(%s), %s,%s,%s);
+    """
+
+
     # SELECTS
     SELECT = f"""
         SELECT `{FIELD_SONG_ID}`, `{FIELD_OFFSET}`
@@ -142,6 +164,12 @@ class MySQLDatabase(CommonDatabase):
             cur.execute(self.INSERT_SONG, (song_name, file_hash, total_hashes))
             return cur.lastrowid
 
+
+    def insert_song_file_info(self, song_name: str, file_hash: str, total_hashes: int,song_path:str,song_size:int) -> int:
+         with self.cursor() as cur:
+            cur.execute(self.INSERT_SONG_FILE_INFO, (song_name, file_hash, total_hashes,song_path,song_size))
+            return cur.lastrowid
+
     def __getstate__(self):
         return self._options,

筛选重复音乐语句#

-- List every file whose content hash occurs more than once in song_file_info,
-- next to the song row that hash was fingerprinted as.
-- Tables are qualified with dejavu_db, the database named in the configuration
-- above; adjust the schema name if yours differs.
select s.song_id, s.song_name, f.song_name as file_song_name, f.file_path, f.file_size, s.total_hashes
from dejavu_db.songs s
join dejavu_db.song_file_info f on s.file_sha1 = f.file_sha1
where s.file_sha1 in
(
    -- hashes recorded for more than one file
    select d.file_sha1
    from dejavu_db.song_file_info d
    group by d.file_sha1
    having count(*) > 1
)
order by s.file_sha1;

修改文件名或者标题#

检查是否乱码#

is_garbled.py

# -*- coding: utf-8 -*-
import re
import argparse
# Command-line interface: a single optional --text option carrying the string
# to check; it defaults to the empty string.
parser = argparse.ArgumentParser(description='manual to this script')
parser.add_argument("--text", type=str, default='')
# Parsing happens at module level, so it also runs when this file is imported.
args = parser.parse_args()

def is_garbled(text):
    """Report whether *text* contains any character outside the accepted set.

    Accepted characters are U+0021-U+007E, the space character and
    U+4E00-U+9FFF. Anything else (control characters, accented Latin
    letters, full-width punctuation, ...) marks the text as garbled.

    :param text: the string to inspect.
    :return: True if at least one unaccepted character is present,
        otherwise False (including for the empty string).
    """
    unexpected = re.search(r'[^\u0021-\u007E \u4E00-\u9FFF]+', text)
    return unexpected is not None

def main():
    """Check the --text value, print the verdict and return it.

    Reads the module-level ``args`` namespace, so the command line must
    already have been parsed.

    :return: the boolean produced by ``is_garbled``.
    """
    verdict = is_garbled(args.text)
    print(verdict)
    return verdict

if __name__ == '__main__':
    main()

修改文件标题或者文件名#

#!/bin/bash

# Print the usage summary to stdout, then end the script with status 0.
help() {
    cat <<HELP
-p path -py is_garbled.py path 
-h show this help
HELP
    exit 0
}

# Parse the command line:
#   -p  <dir>   directory holding the music files
#   -py <dir>   directory holding is_garbled.py
#   -h          show usage and exit
[ -z "$1" ] && { echo -h for help; exit; }
while [ -n "$1" ]; do
    case "$1" in
        -h) help ;;   # help terminates the script itself
        -p | -py)
            # "shift 2" fails without shifting anything when the value is
            # missing, which would make the loop read the same option forever.
            if [ $# -lt 2 ]; then
                echo "Error: $1 needs a value. -h for help."
                exit 1
            fi
            if [ "$1" = "-p" ]; then path=$2; else py_path=$2; fi
            shift 2
            ;;
        -*)
            # The original had no ';' between the message and its terminating
            # command, so that word was echoed instead of executed and the
            # loop never ended. An unknown option now stops the script.
            echo "Error: NO Such Option. -h for help."
            exit 1
            ;;
        *) break ;;
    esac
done


# Both directories are required: -p (music files) and -py (location of
# is_garbled.py). The original chained the two tests with &&, so it only
# stopped when BOTH were missing.
if [ -z "$path" ] || [ -z "$py_path" ]; then
    echo -h for help
    exit 1
fi

# ffmpeg output and renamed files are written to "$path/new"; create it up
# front, because neither ffmpeg nor mv creates missing directories.
mkdir -p "$path/new" || exit 1

# Iterate with a glob instead of parsing `ls`: names containing spaces or
# colons stay intact and IFS never has to be modified.
for file_path in "$path"/*
do
  # Skips sub-directories (such as "new") and the literal pattern that is left
  # behind when the directory is empty.
  [ -f "$file_path" ] || continue
  file=${file_path##*/}

  # Ask ffprobe for the container-level "title" tag only, printed as a bare
  # value. Grepping the JSON dump and splitting on ':' truncated titles that
  # contain a colon and could pick up unrelated lines containing "title".
  title=$(ffprobe -v quiet -show_entries format_tags=title \
                  -of default=noprint_wrappers=1:nokey=1 "$file_path")
  # Strip leading whitespace.
  title="${title#"${title%%[![:space:]]*}"}"

  if [ -n "$title" ]; then
      # --text=VALUE keeps a title starting with "-" from being read as an option.
      is_garbled=$(python3 "$py_path/is_garbled.py" --text="$title")
  else
      is_garbled="Empty"
  fi

  case "$is_garbled" in
    True)
      echo "garbled: $file"
      # Use the file name without its extension as the replacement title.
      new_title="${file%.*}"
      echo "new_title: $new_title"
      ffmpeg -i "$file_path" -c copy -metadata title="$new_title" "$path/new/$file"
      ;;
    False)
      echo "not garbled: $file"
      file_name=${file%.*}
      length=${#file_name}
      ext_name=${file:$length}
      # Only names made of exactly four capital letters are renamed after
      # their (readable) title; the title is known to be non-empty here.
      if [ "$length" -eq 4 ] && [[ $file_name =~ [A-Z]{4} ]]; then
        # A "/" inside the title would be treated as a directory separator by mv.
        new_file_name="${title//\//_}${ext_name}"
        mv "$path/$file" "$path/new/$new_file_name"
        echo "rename file_name: $file_name to new_file_name: $new_file_name"
      fi
      ;;
    Empty)
      echo "--------Empty Title: $path/$file ---------"
      ;;
    *)
      # is_garbled.py printed neither True nor False (for example it failed to run).
      echo "skipped, could not check title: $file" >&2
      ;;
  esac
done