Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Commit 
							
							·
						
						c82bb46
	
1
								Parent(s):
							
							07e9a8a
								
Upload 18 files
Browse files
 - CppDataProcess/F0Preprocess.cpp +153 -0
 - CppDataProcess/F0Preprocess.hpp +36 -0
 - CppDataProcess/Slicer.hpp +82 -0
 - CppDataProcess/Wav.cpp +151 -0
 - CppDataProcess/Wav.hpp +99 -0
 - CppDataProcess/readme.md +8 -0
 - cluster/__init__.py +29 -0
 - cluster/__pycache__/__init__.cpython-38.pyc +0 -0
 - cluster/__pycache__/kmeans.cpython-38.pyc +0 -0
 - cluster/km_train.py +80 -0
 - cluster/kmeans.py +204 -0
 - cluster/train_cluster.py +85 -0
 - configs/config.json +94 -0
 - configs/diffusion.yaml +48 -0
 - configs_template/config_template.json +77 -0
 - configs_template/config_tiny_template.json +77 -0
 - configs_template/diffusion_template.yaml +51 -0
 - dataset_raw/wav_structure.txt +20 -0
 
    	
        CppDataProcess/F0Preprocess.cpp
    ADDED
    
    | 
         @@ -0,0 +1,153 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            #include "F0Preprocess.hpp"
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            void F0PreProcess::compute_f0(const double* audio, int64_t len)
         
     | 
| 5 | 
         
            +
            {
         
     | 
| 6 | 
         
            +
            	DioOption Doption;
         
     | 
| 7 | 
         
            +
            	InitializeDioOption(&Doption);
         
     | 
| 8 | 
         
            +
            	Doption.f0_ceil = 800;
         
     | 
| 9 | 
         
            +
            	Doption.frame_period = 1000.0 * hop / fs;
         
     | 
| 10 | 
         
            +
            	f0Len = GetSamplesForDIO(fs, (int)len, Doption.frame_period);
         
     | 
| 11 | 
         
            +
            	const auto tp = new double[f0Len];
         
     | 
| 12 | 
         
            +
            	const auto tmpf0 = new double[f0Len];
         
     | 
| 13 | 
         
            +
            	rf0 = new double[f0Len];
         
     | 
| 14 | 
         
            +
            	Dio(audio, (int)len, fs, &Doption, tp, tmpf0);
         
     | 
| 15 | 
         
            +
            	StoneMask(audio, (int)len, fs, tp, tmpf0, (int)f0Len, rf0);
         
     | 
| 16 | 
         
            +
            	delete[] tmpf0;
         
     | 
| 17 | 
         
            +
            	delete[] tp;
         
     | 
| 18 | 
         
            +
            }
         
     | 
| 19 | 
         
            +
             
     | 
| 20 | 
         
            +
            /// Builds the sequence start, start + step, start + 2*step, ... for every value
            /// strictly below `end`, storing each value divided by `div`.
            ///
            /// @param start First value of the sequence (before division).
            /// @param end   Exclusive upper bound (before division).
            /// @param step  Increment between consecutive values; must be positive to make progress.
            /// @param div   Divisor applied to every stored value.
            /// @return The generated values; empty when `start >= end` or when `step <= 0`.
            std::vector<double> arange(double start,double end,double step = 1.0,double div = 1.0)
            {
                std::vector<double> output;
                // A non-positive step can never reach `end`; without this guard the loop
                // would run forever and grow the vector until memory is exhausted.
                if (step <= 0.0)
                    return output;
                while (start < end)
                {
                    output.push_back(start / div);
                    start += step;
                }
                return output;
            }
         
     | 
| 30 | 
         
            +
             
     | 
| 31 | 
         
            +
            void F0PreProcess::InterPf0(int64_t len)
         
     | 
| 32 | 
         
            +
            {
         
     | 
| 33 | 
         
            +
            	const auto xi = arange(0.0, (double)f0Len * (double)len, (double)f0Len, (double)len);
         
     | 
| 34 | 
         
            +
            	const auto tmp = new double[xi.size() + 1];
         
     | 
| 35 | 
         
            +
            	interp1(arange(0, (double)f0Len).data(), rf0, static_cast<int>(f0Len), xi.data(), (int)xi.size(), tmp);
         
     | 
| 36 | 
         
            +
            	for (size_t i = 0; i < xi.size(); i++)
         
     | 
| 37 | 
         
            +
            		if (isnan(tmp[i]))
         
     | 
| 38 | 
         
            +
            			tmp[i] = 0.0;
         
     | 
| 39 | 
         
            +
            	delete[] rf0;
         
     | 
| 40 | 
         
            +
                rf0 = nullptr;
         
     | 
| 41 | 
         
            +
            	rf0 = tmp;
         
     | 
| 42 | 
         
            +
            	f0Len = (int64_t)xi.size();
         
     | 
| 43 | 
         
            +
            }
         
     | 
| 44 | 
         
            +
             
     | 
| 45 | 
         
            +
            long long* F0PreProcess::f0Log()
         
     | 
| 46 | 
         
            +
            {
         
     | 
| 47 | 
         
            +
            	const auto tmp = new long long[f0Len];
         
     | 
| 48 | 
         
            +
            	const auto f0_mel = new double[f0Len];
         
     | 
| 49 | 
         
            +
            	for (long long i = 0; i < f0Len; i++)
         
     | 
| 50 | 
         
            +
            	{
         
     | 
| 51 | 
         
            +
            		f0_mel[i] = 1127 * log(1.0 + rf0[i] / 700.0);
         
     | 
| 52 | 
         
            +
            		if (f0_mel[i] > 0.0)
         
     | 
| 53 | 
         
            +
            			f0_mel[i] = (f0_mel[i] - f0_mel_min) * (f0_bin - 2.0) / (f0_mel_max - f0_mel_min) + 1.0;
         
     | 
| 54 | 
         
            +
            		if (f0_mel[i] < 1.0)
         
     | 
| 55 | 
         
            +
            			f0_mel[i] = 1;
         
     | 
| 56 | 
         
            +
            		if (f0_mel[i] > f0_bin - 1)
         
     | 
| 57 | 
         
            +
            			f0_mel[i] = f0_bin - 1;
         
     | 
| 58 | 
         
            +
            		tmp[i] = (long long)round(f0_mel[i]);
         
     | 
| 59 | 
         
            +
            	}
         
     | 
| 60 | 
         
            +
            	delete[] f0_mel;
         
     | 
| 61 | 
         
            +
            	delete[] rf0;
         
     | 
| 62 | 
         
            +
                rf0 = nullptr;
         
     | 
| 63 | 
         
            +
            	return tmp;
         
     | 
| 64 | 
         
            +
            }
         
     | 
| 65 | 
         
            +
             
     | 
| 66 | 
         
            +
            std::vector<long long> F0PreProcess::GetF0AndOtherInput(const double* audio, int64_t audioLen, int64_t hubLen, int64_t tran)
         
     | 
| 67 | 
         
            +
            {
         
     | 
| 68 | 
         
            +
            	compute_f0(audio, audioLen);
         
     | 
| 69 | 
         
            +
            	for (int64_t i = 0; i < f0Len; ++i)
         
     | 
| 70 | 
         
            +
            	{
         
     | 
| 71 | 
         
            +
            		rf0[i] = rf0[i] * pow(2.0, static_cast<double>(tran) / 12.0);
         
     | 
| 72 | 
         
            +
            		if (rf0[i] < 0.001)
         
     | 
| 73 | 
         
            +
            			rf0[i] = NAN;
         
     | 
| 74 | 
         
            +
            	}
         
     | 
| 75 | 
         
            +
            	InterPf0(hubLen);
         
     | 
| 76 | 
         
            +
            	const auto O0f = f0Log();
         
     | 
| 77 | 
         
            +
            	std::vector<long long> Of0(O0f, O0f + f0Len);
         
     | 
| 78 | 
         
            +
                delete[] O0f;
         
     | 
| 79 | 
         
            +
            	return Of0;
         
     | 
| 80 | 
         
            +
            }
         
     | 
| 81 | 
         
            +
             
     | 
| 82 | 
         
            +
            /// Builds a frame-to-unit table of `specLen + 1` entries.
            ///
            /// The frame axis is split into `hubertLen` consecutive spans of
            /// `specLen / hubertLen` frames each (span ends are rounded); every frame in a
            /// span receives that span's 1-based index. Frames not covered by any span keep 0.
            ///
            /// @param specLen   Number of frames; the table has one extra trailing entry.
            /// @param hubertLen Number of units the frames are distributed over.
            /// @return The table mapping frame index to 1-based unit index.
            std::vector<long long> getAligments(size_t specLen, size_t hubertLen)
            {
                std::vector<long long> table(specLen + 1, 0);
                const double framesPerUnit = static_cast<double>(specLen) / static_cast<double>(hubertLen);

                size_t cursor = 0;
                for (size_t unit = 0; unit < hubertLen; ++unit)
                {
                    const size_t lastFrame = static_cast<size_t>(round(static_cast<double>(unit) * framesPerUnit + framesPerUnit));
                    const long long label = static_cast<long long>(unit) + 1;
                    while (cursor < lastFrame + 1)
                    {
                        table[cursor] = label;
                        ++cursor;
                    }
                    // The next span always starts right after this span's rounded end.
                    cursor = lastFrame + 1;
                }

                return table;
            }
         
     | 
| 98 | 
         
            +
             
     | 
| 99 | 
         
            +
            std::vector<float> F0PreProcess::GetF0AndOtherInputF0(const double* audio, int64_t audioLen, int64_t tran)
         
     | 
| 100 | 
         
            +
            {
         
     | 
| 101 | 
         
            +
            	compute_f0(audio, audioLen);
         
     | 
| 102 | 
         
            +
            	for (int64_t i = 0; i < f0Len; ++i)
         
     | 
| 103 | 
         
            +
            	{
         
     | 
| 104 | 
         
            +
            		rf0[i] = log2(rf0[i] * pow(2.0, static_cast<double>(tran) / 12.0));
         
     | 
| 105 | 
         
            +
            		if (rf0[i] < 0.001)
         
     | 
| 106 | 
         
            +
            			rf0[i] = NAN;
         
     | 
| 107 | 
         
            +
            	}
         
     | 
| 108 | 
         
            +
            	const int64_t specLen = audioLen / hop;
         
     | 
| 109 | 
         
            +
            	InterPf0(specLen);
         
     | 
| 110 | 
         
            +
             
     | 
| 111 | 
         
            +
                std::vector<float> Of0(specLen, 0.0);
         
     | 
| 112 | 
         
            +
             
     | 
| 113 | 
         
            +
                double last_value = 0.0;
         
     | 
| 114 | 
         
            +
                for (int64_t i = 0; i < specLen; ++i)
         
     | 
| 115 | 
         
            +
                {
         
     | 
| 116 | 
         
            +
                    if (rf0[i] <= 0.0)
         
     | 
| 117 | 
         
            +
                    {
         
     | 
| 118 | 
         
            +
                        int64_t j = i + 1;
         
     | 
| 119 | 
         
            +
                        for (; j < specLen; ++j)
         
     | 
| 120 | 
         
            +
                        {
         
     | 
| 121 | 
         
            +
                            if (rf0[j] > 0.0)
         
     | 
| 122 | 
         
            +
                                break;
         
     | 
| 123 | 
         
            +
                        }
         
     | 
| 124 | 
         
            +
                        if (j < specLen - 1)
         
     | 
| 125 | 
         
            +
                        {
         
     | 
| 126 | 
         
            +
                            if (last_value > 0.0)
         
     | 
| 127 | 
         
            +
                            {
         
     | 
| 128 | 
         
            +
                                const auto step = (rf0[j] - rf0[i - 1]) / double(j - i);
         
     | 
| 129 | 
         
            +
                                for (int64_t k = i; k < j; ++k)
         
     | 
| 130 | 
         
            +
                                    Of0[k] = float(rf0[i - 1] + step * double(k - i + 1));
         
     | 
| 131 | 
         
            +
                            }
         
     | 
| 132 | 
         
            +
                            else
         
     | 
| 133 | 
         
            +
                                for (int64_t k = i; k < j; ++k)
         
     | 
| 134 | 
         
            +
                                    Of0[k] = float(rf0[j]);
         
     | 
| 135 | 
         
            +
                            i = j;
         
     | 
| 136 | 
         
            +
                        }
         
     | 
| 137 | 
         
            +
                        else
         
     | 
| 138 | 
         
            +
                        {
         
     | 
| 139 | 
         
            +
                            for (int64_t k = i; k < specLen; ++k)
         
     | 
| 140 | 
         
            +
                                Of0[k] = float(last_value);
         
     | 
| 141 | 
         
            +
                            i = specLen;
         
     | 
| 142 | 
         
            +
                        }
         
     | 
| 143 | 
         
            +
                    }
         
     | 
| 144 | 
         
            +
                    else
         
     | 
| 145 | 
         
            +
                    {
         
     | 
| 146 | 
         
            +
                        Of0[i] = float(rf0[i - 1]);
         
     | 
| 147 | 
         
            +
                        last_value = rf0[i];
         
     | 
| 148 | 
         
            +
                    }
         
     | 
| 149 | 
         
            +
                }
         
     | 
| 150 | 
         
            +
                delete[] rf0;
         
     | 
| 151 | 
         
            +
                rf0 = nullptr;
         
     | 
| 152 | 
         
            +
            	return Of0;
         
     | 
| 153 | 
         
            +
            }
         
     | 
    	
        CppDataProcess/F0Preprocess.hpp
    ADDED
    
    | 
         @@ -0,0 +1,36 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            #include "world/dio.h"
         
     | 
| 2 | 
         
            +
            #include "world/stonemask.h"
         
     | 
| 3 | 
         
            +
            #include "world/matlabfunctions.h"
         
     | 
| 4 | 
         
            +
            #include <string>
         
     | 
| 5 | 
         
            +
            #include <vector>
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            //Cpp F0 Preprocess
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            /// F0 extraction and post-processing helper built on the WORLD functions
            /// declared in the headers included above.
            ///
            /// The object owns the current contour in `rf0` (allocated with new[]); the
            /// pipeline methods replace or release it as they run.
            class F0PreProcess
            {
            public:
                int fs;      // sample rate handed to the WORLD calls
                short hop;   // samples per analysis frame
                const int f0_bin = 256;        // number of quantization bins used by f0Log
                const double f0_max = 1100.0;  // upper F0 bound of the quantization range
                const double f0_min = 50.0;    // lower F0 bound of the quantization range
                const double f0_mel_min = 1127.0 * log(1.0 + f0_min / 700.0);
                const double f0_mel_max = 1127.0 * log(1.0 + f0_max / 700.0);

                F0PreProcess(int sr = 16000, short h = 160) :fs(sr), hop(h) {}

                /// Deep copy: the implicit member-wise copy would share `rf0` between both
                /// objects and free it twice when they are destroyed.
                F0PreProcess(const F0PreProcess& other) :fs(other.fs), hop(other.hop), f0Len(other.f0Len)
                {
                    if (other.rf0 != nullptr && other.f0Len > 0)
                    {
                        rf0 = new double[other.f0Len];
                        for (int64_t i = 0; i < other.f0Len; ++i)
                            rf0[i] = other.rf0[i];
                    }
                }

                /// Assignment was already impossible because of the const members; spelling
                /// it out documents that the class is copy-constructible only.
                F0PreProcess& operator=(const F0PreProcess&) = delete;

                ~F0PreProcess()
                {
                    delete[] rf0;
                    rf0 = nullptr;
                }

                /// Estimates the contour of `audio` and stores it in the object.
                void compute_f0(const double* audio, int64_t len);
                /// Resamples the stored contour; `len` sets the target frame grid.
                void InterPf0(int64_t len);
                /// Quantizes the stored contour; returns a new[] array the caller must delete[].
                long long* f0Log();
                /// Number of frames in the current contour.
                int64_t getLen()const { return f0Len; }
                /// Full pipeline returning quantized F0 bins.
                std::vector<long long> GetF0AndOtherInput(const double* audio, int64_t audioLen, int64_t hubLen, int64_t tran);
                /// Full pipeline returning a gap-filled log2-F0 contour.
                std::vector<float> GetF0AndOtherInputF0(const double* audio, int64_t audioLen, int64_t tran);
            private:
                double* rf0 = nullptr;  // owned contour buffer, or nullptr when none is held
                int64_t f0Len = 0;      // frame count of the contour
            };
         
     | 
| 35 | 
         
            +
             
     | 
| 36 | 
         
            +
            std::vector<long long> getAligments(size_t specLen, size_t hubertLen);
         
     | 
    	
        CppDataProcess/Slicer.hpp
    ADDED
    
    | 
         @@ -0,0 +1,82 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            #include <string>
         
     | 
| 2 | 
         
            +
            #include <vector>
         
     | 
| 3 | 
         
            +
            #include "Wav.hpp"
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            /// Result of slicing a waveform: cut offsets plus one flag per slice.
            struct SliceResult
            {
                std::vector<unsigned long long>	SliceOffset;  // slice boundaries, first to last
                std::vector<bool> SliceTag;                       // one flag per pair of consecutive offsets

                /// Takes ownership of both vectors. The constructor must carry the struct's
                /// own name to be valid C++, and it stays non-explicit so that
                /// `return { offsets, tags };` keeps working.
                SliceResult(std::vector<unsigned long long>&& O, std::vector<bool>&& T) :SliceOffset(std::move(O)), SliceTag(std::move(T)) {}
            };
         
     | 
| 11 | 
         
            +
             
     | 
| 12 | 
         
            +
            /// Mean absolute value of the samples in the INCLUSIVE range [start, end].
            ///
            /// Declared inline because it is defined in a header; without it every
            /// translation unit including this file would emit its own definition.
            ///
            /// @param start First sample of the range.
            /// @param end   Last sample of the range (it is read, not one-past-the-end).
            /// @return The running mean of |sample|; 0.0 when the range is empty or null.
            inline double getAvg(const short* start, const short* end)
            {
                if (start == nullptr || end == nullptr || end < start)
                    return 0.0;

                // Magnitude is computed by hand: an unqualified abs() on a double can bind
                // to the int overload depending on which headers are visible.
                const double first = static_cast<double>(*start);
                // The first sample is taken as a magnitude like all the others; seeding
                // the mean with its signed value would bias (or even negate) the result.
                double avg = first < 0.0 ? -first : first;

                unsigned long long seen = 1;
                for (const short* sample = start + 1; sample <= end; ++sample)
                {
                    const double value = static_cast<double>(*sample);
                    const double magnitude = value < 0.0 ? -value : value;
                    ++seen;
                    // Incremental mean: avoids summing a potentially long range first.
                    avg = avg + (magnitude - avg) / static_cast<double>(seen);
                }
                return avg;
            }
         
     | 
| 22 | 
         
            +
             
     | 
| 23 | 
         
            +
            /// Splits `input` at frames where the level crosses `threshold`.
            ///
            /// Input shorter than `minLen * header.bytesPerSec` is returned as one slice
            /// tagged true. Otherwise the data is scanned in steps of `frame_shift`; each
            /// time the frame level switches between "below threshold" and "not below
            /// threshold", and more than `minLen * header.bytesPerSec` has been advanced
            /// since the last cut, a cut is recorded at the frame's midpoint. Finally every
            /// slice is tagged with whether its own level exceeds `threshold`.
            ///
            /// NOTE(review): offsets are counted in elements of whatever pointer type
            /// `Wav::getData()` returns, which is not visible here — confirm the unit.
            ///
            /// @param input       Waveform providing getHeader() and getData().
            /// @param threshold   Level separating the two frame states.
            /// @param minLen      Minimum slice length, multiplied by header.bytesPerSec.
            /// @param frame_len   Frame length passed to getAvg (whose end is inclusive).
            /// @param frame_shift Advance between consecutive frames.
            /// @return Cut offsets (starting at 0, ending at Subchunk2Size) and per-slice tags.
            inline SliceResult SliceWav(Wav& input, double threshold, unsigned long minLen, unsigned short frame_len, unsigned short frame_shift)
            {
                const auto header = input.getHeader();
                if (header.Subchunk2Size < minLen * header.bytesPerSec)
                    return { {0,header.Subchunk2Size},{true} };

                auto cursor = input.getData();
                std::vector<unsigned long long> cutPoints;
                std::vector<bool> sliceTags;
                auto framesLeft = (header.Subchunk2Size / frame_shift) - 2 * (frame_len / frame_shift);
                unsigned long sinceLastCut = 0;
                bool previousAtOrAbove = true;
                cutPoints.emplace_back(0);

                for (; framesLeft--; sinceLastCut += frame_shift, cursor += frame_shift)
                {
                    const auto level = abs(getAvg((short*)cursor, (short*)cursor + frame_len));
                    const bool atOrAbove = !(level < threshold);

                    // A cut is only considered when the state flips between frames.
                    if (atOrAbove != previousAtOrAbove)
                    {
                        if (sinceLastCut > minLen * header.bytesPerSec)
                        {
                            sinceLastCut = 0;
                            cutPoints.emplace_back((cursor - input.getData()) + (frame_len / 2));
                        }
                    }
                    previousAtOrAbove = atOrAbove;
                }

                cutPoints.push_back(header.Subchunk2Size);
                for (size_t slice = 1; slice < cutPoints.size(); slice++)
                {
                    const auto sliceLevel = abs(getAvg((short*)(input.getData() + cutPoints[slice - 1]), (short*)(input.getData() + cutPoints[slice])));
                    sliceTags.push_back(sliceLevel > threshold);
                }
                return { std::move(cutPoints),std::move(sliceTags) };
            }
         
     | 
| 82 | 
         
            +
             
     | 
    	
        CppDataProcess/Wav.cpp
    ADDED
    
    | 
         @@ -0,0 +1,151 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            #include "Wav.hpp"
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            // Loads a WAV file from disk.
            //
            // The leading HEAD_LENGTH bytes of the file (never more than the local scratch buffer
            // holds) are scanned for the "RIFF", "fmt" and "data" tags. The fields that follow each
            // tag are copied into `header`, and the bytes after the "data" size field are read into
            // a heap buffer owned by this object.
            //
            // Path: wide-character path of the file to open.
            // Throws std::exception when the file cannot be opened, or when a tag together with the
            // fields that follow it is not present in the scanned prefix.
            Wav::Wav(const wchar_t* Path) :header(WAV_HEADER()) {
            	const char* const malformed = "Don't order fried rice (annoyed)";
            	char buf[1024];
            	FILE* stream = nullptr;
            	// Open a dedicated stream. Re-opening `stderr` onto the WAV file (as this code used to
            	// do) redirected and later closed the process-wide error stream.
            	if (_wfopen_s(&stream, Path, L"rb") != 0 || stream == nullptr) {
            		throw (std::exception("File not exists"));
            	}
            	// Closes the stream on every exit path, including the throws below.
            	struct StreamCloser {
            		FILE* handle;
            		~StreamCloser() { fclose(handle); }
            	} closer{ stream };

            	// Never read more than the scratch buffer can hold, and never parse more than was read.
            	size_t wanted = static_cast<size_t>(HEAD_LENGTH);
            	if (wanted > sizeof(buf)) wanted = sizeof(buf);
            	const int limit = static_cast<int>(fread(buf, 1, wanted, stream));
            	int pos = 0;

            	// Moves `pos` to the first byte after the 4-byte slot of the next `tag`; false if absent.
            	auto seekPast = [&](const char* tag, int tagLen) -> bool {
            		for (; pos + tagLen <= limit; ++pos) {
            			if (memcmp(buf + pos, tag, static_cast<size_t>(tagLen)) == 0) {
            				pos += 4;
            				return true;
            			}
            		}
            		return false;
            	};
            	// memcpy-based loads: the fields are not guaranteed to be aligned inside `buf`.
            	auto readInt = [&]() -> int {
            		int value = 0;
            		memcpy(&value, buf + pos, sizeof(value));
            		pos += static_cast<int>(sizeof(value));
            		return value;
            	};
            	auto readShort = [&]() -> short {
            		short value = 0;
            		memcpy(&value, buf + pos, sizeof(value));
            		pos += static_cast<int>(sizeof(value));
            		return value;
            	};

            	if (!seekPast("RIFF", 4) || pos + 4 > limit)
            		throw (std::exception(malformed));
            	header.ChunkSize = readInt();
            	pos += 4;	// skip the four bytes that follow ChunkSize

            	// Six fields, 20 bytes in total, follow the "fmt" tag slot.
            	if (!seekPast("fmt", 3) || pos + 20 > limit)
            		throw (std::exception(malformed));
            	header.Subchunk1Size = readInt();
            	header.AudioFormat = readShort();
            	header.NumOfChan = readShort();
            	header.SamplesPerSec = readInt();
            	header.bytesPerSec = readInt();
            	header.blockAlign = readShort();
            	header.bitsPerSample = readShort();

            	if (!seekPast("data", 4) || pos + 4 > limit)
            		throw (std::exception(malformed));
            	header.Subchunk2Size = readInt();
            	StartPos = pos;

            	const size_t payload = static_cast<size_t>(header.Subchunk2Size);
            	// payload + 1 must not wrap to zero where size_t is as narrow as the size field.
            	if (payload == static_cast<size_t>(-1))
            		throw (std::exception(malformed));
            	Data = new char[payload + 1];
            	size_t got = 0;
            	if (fseek(stream, StartPos, SEEK_SET) == 0)
            		got = fread(Data, 1, payload, stream);
            	// A truncated file delivers fewer bytes than the header promises: zero the remainder
            	// (and the spare trailing byte) instead of exposing uninitialized memory.
            	memset(Data + got, 0, payload + 1 - got);

            	SData = reinterpret_cast<int16_t*>(Data);
            	dataSize = header.Subchunk2Size / 2;
            }
         
     | 
| 67 | 
         
            +
             
     | 
| 68 | 
         
            +
            // Copy constructor: duplicates the header and takes a private copy of the payload.
            //
            // A source whose buffer pointer is null (default-constructed, or left behind by a move)
            // has nothing to copy whatever size its header still states, so the copy is created
            // with an empty payload instead of reading through a null pointer.
            Wav::Wav(const Wav& input) :header(input.header) {
            	const size_t payload = (input.Data != nullptr) ? static_cast<size_t>(input.header.Subchunk2Size) : 0;
            	// `new` reports failure by throwing, so no null check is needed afterwards.
            	Data = new char[payload + 1];
            	if (payload != 0)
            		memcpy(Data, input.Data, payload);
            	Data[payload] = 0;
            	header.Subchunk2Size = static_cast<unsigned long>(payload);
            	StartPos = input.StartPos;
            	SData = reinterpret_cast<int16_t*>(Data);
            	dataSize = header.Subchunk2Size / 2;
            }
         
     | 
| 89 | 
         
            +
             
     | 
| 90 | 
         
            +
            // Move constructor: takes over the payload buffer and header of `input`.
            //
            // The source is left empty (null buffer, zero sample count, zero Subchunk2Size) so that
            // isEmpty(), operator[] and copying do not act on a buffer it no longer owns.
            Wav::Wav(Wav&& input) noexcept
            	:header(input.header), Data(input.Data), StartPos(input.StartPos)
            {
            	SData = reinterpret_cast<int16_t*>(Data);
            	// Derived from the header, as before; zero when there is no buffer to index.
            	dataSize = (Data != nullptr) ? header.Subchunk2Size / 2 : 0;

            	input.Data = nullptr;
            	input.SData = nullptr;
            	input.dataSize = 0;
            	input.header.Subchunk2Size = 0;
            }
         
     | 
| 111 | 
         
            +
             
     | 
| 112 | 
         
            +
            // Move assignment: releases the current payload, then takes over the buffer and header
            // of `input`, leaving `input` empty.
            //
            // Returns *this.
            Wav& Wav::operator=(Wav&& input) noexcept
            {
            	// Self-assignment would free the buffer and then lose the only pointer to it.
            	if (this == &input)
            		return *this;

            	destory();
            	header = input.header;
            	Data = input.Data;
            	StartPos = input.StartPos;
            	SData = reinterpret_cast<int16_t*>(Data);
            	// Derived from the header, as before; zero when there is no buffer to index.
            	dataSize = (Data != nullptr) ? header.Subchunk2Size / 2 : 0;

            	// Detach the source so it neither frees nor indexes the buffer it handed over.
            	input.Data = nullptr;
            	input.SData = nullptr;
            	input.dataSize = 0;
            	input.header.Subchunk2Size = 0;
            	return *this;
            }
         
     | 
| 135 | 
         
            +
             
     | 
| 136 | 
         
            +
            // Appends the payload of `input` to this object.
            //
            // Nothing is changed unless this object's AudioFormat is 1 and both objects agree on
            // sample rate, bits per sample and channel count. (The sample rate used to be compared
            // with the other object's bits-per-sample, which rejected matching streams.)
            //
            // Returns *this, whether or not anything was appended.
            Wav& Wav::cat(const Wav& input)
            {
            	if (header.AudioFormat != 1) return *this;
            	const bool sameLayout =
            		header.SamplesPerSec == input.header.SamplesPerSec &&
            		header.bitsPerSample == input.header.bitsPerSample &&
            		header.NumOfChan == input.header.NumOfChan;
            	if (!sameLayout) return *this;

            	// A null buffer contributes no bytes, whatever its header says.
            	const size_t ownBytes = (Data != nullptr) ? static_cast<size_t>(header.Subchunk2Size) : 0;
            	const size_t addBytes = (input.Data != nullptr) ? static_cast<size_t>(input.header.Subchunk2Size) : 0;

            	// Build the joined buffer before touching any member: `input` may be *this.
            	char* joined = new char[ownBytes + addBytes + 1];
            	if (ownBytes != 0)
            		memcpy(joined, Data, ownBytes);
            	if (addBytes != 0)
            		memcpy(joined + ownBytes, input.Data, addBytes);
            	joined[ownBytes + addBytes] = 0;

            	header.ChunkSize += static_cast<unsigned long>(addBytes);
            	header.Subchunk2Size = static_cast<unsigned long>(ownBytes + addBytes);
            	delete[] Data;
            	Data = joined;
            	SData = reinterpret_cast<int16_t*>(Data);
            	dataSize = header.Subchunk2Size / 2;
            	return *this;
            }
         
     | 
    	
        CppDataProcess/Wav.hpp
    ADDED
    
    | 
         @@ -0,0 +1,99 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            #pragma once
            #include <cstddef>
            #include <cstdint>
            #include <cstring>

            // In-memory WAV clip: a header description plus one heap buffer of payload bytes that
            // is also exposed as 16-bit samples.
            class Wav {
            public:

            	// Header fields of a WAV file. The defaults describe an empty clip.
            	struct WAV_HEADER {
            		char             RIFF[4] = { 'R','I','F','F' };              // "RIFF" tag
            		unsigned long    ChunkSize;                                  // file size minus 8
            		char             WAVE[4] = { 'W','A','V','E' };              // "WAVE" tag
            		char             fmt[4] = { 'f','m','t',' ' };               // "fmt " tag
            		unsigned long    Subchunk1Size;                              // size of the fmt block
            		unsigned short   AudioFormat;                                // encoding format
            		unsigned short   NumOfChan;                                  // number of channels
            		unsigned long    SamplesPerSec;                              // sample rate
            		unsigned long    bytesPerSec;                                // bytes per second
            		unsigned short   blockAlign;                                 // block alignment in bytes
            		unsigned short   bitsPerSample;                              // bits per sample
            		char             Subchunk2ID[4] = { 'd','a','t','a' };       // "data" tag
            		unsigned long    Subchunk2Size;                              // payload size in bytes
            		WAV_HEADER(unsigned long cs = 36, unsigned long sc1s = 16, unsigned short af = 1, unsigned short nc = 1, unsigned long sr = 22050, unsigned long bps = 44100, unsigned short ba = 2, unsigned short bips = 16, unsigned long sc2s = 0) :ChunkSize(cs), Subchunk1Size(sc1s), AudioFormat(af), NumOfChan(nc), SamplesPerSec(sr), bytesPerSec(bps), blockAlign(ba), bitsPerSample(bips), Subchunk2Size(sc2s) {}
            	};
            	using iterator = int16_t*;

            	// Creates a clip without payload; the arguments populate the header fields in order.
            	Wav(unsigned long cs = 36, unsigned long sc1s = 16, unsigned short af = 1, unsigned short nc = 1, unsigned long sr = 22050, unsigned long bps = 44100, unsigned short ba = 2, unsigned short bips = 16, unsigned long sc2s = 0)
            		:header(cs, sc1s, af, nc, sr, bps, ba, bips, sc2s), Data(nullptr), SData(nullptr), dataSize(0), StartPos(44) {}

            	// Creates a clip from `length` bytes at `data`, copied into a buffer owned by the
            	// object. `sr` is the sample rate; the header is filled in for one channel of
            	// 16-bit samples.
            	Wav(unsigned long sr, unsigned long length, const void* data)
            		:header(36 + length, 16, 1, 1, sr, sr * 2, 2, 16, length),
            		Data(new char[static_cast<size_t>(length) + 1]),
            		SData(reinterpret_cast<int16_t*>(Data)),
            		dataSize(length / 2),
            		StartPos(44)
            	{
            		if (length != 0)
            			memcpy(Data, data, length);
            		Data[length] = 0;
            	}
            	Wav(const wchar_t* Path);
            	Wav(const Wav& input);
            	Wav(Wav&& input) noexcept;
            	Wav& operator=(const Wav& input) = delete;
            	Wav& operator=(Wav&& input) noexcept;
            	~Wav() { destory(); }

            	// Appends the payload of `input`; defined out of line.
            	Wav& cat(const Wav& input);

            	// True when the header reports a zero-byte payload.
            	bool isEmpty() const { return this->header.Subchunk2Size == 0; }
            	const char* getData() const { return Data; }
            	char* getData() { return Data; }

            	// Copy of the header.
            	WAV_HEADER getHeader() const { return header; }
            	// Mutable access to the header; the payload buffer is not resized to match edits.
            	WAV_HEADER& Header() { return header; }

            	// Frees the payload buffer. The pointer itself is not cleared, so the object must not
            	// be used (or destroyed again) afterwards unless a new buffer has been installed.
            	void destory() const { delete[] Data; }

            	// Replaces the payload with a copy of `length` bytes from `indata` and stores `sr` as
            	// the new sample rate. A null `indata` or a non-positive `length` yields an empty
            	// payload. The sample view and sample count are refreshed together with the buffer,
            	// and the copy is made before the old buffer is freed, so `indata` may point into
            	// the current payload.
            	void changeData(const void* indata,long length,int sr)
            	{
            		const size_t bytes = (indata != nullptr && length > 0) ? static_cast<size_t>(length) : 0;
            		char* fresh = new char[bytes + 1];
            		if (bytes != 0)
            			memcpy(fresh, indata, bytes);
            		fresh[bytes] = 0;

            		delete[] Data;
            		Data = fresh;
            		SData = reinterpret_cast<int16_t*>(Data);
            		dataSize = bytes / 2;

            		header.ChunkSize = 36 + static_cast<unsigned long>(bytes);
            		header.Subchunk2Size = static_cast<unsigned long>(bytes);
            		header.SamplesPerSec = sr;
            		header.bytesPerSec = 2 * sr;
            	}

            	// Sample at `index`; an index past the end yields the last sample instead.
            	// Precondition: the clip holds at least one sample.
            	int16_t& operator[](const size_t index) const
            	{
            		const size_t slot = (index < dataSize) ? index : dataSize - 1;
            		return *(SData + slot);
            	}
            	// First sample of the payload.
            	iterator begin() const
            	{
            		return reinterpret_cast<int16_t*>(Data);
            	}
            	// One past the payload, located from the byte count stored in the header.
            	iterator end() const
            	{
            		return reinterpret_cast<int16_t*>(Data + header.Subchunk2Size);
            	}
            	// Number of 16-bit samples in the payload.
            	int64_t getDataLen()const
            	{
            		return static_cast<int64_t>(dataSize);
            	}
            private:
            	WAV_HEADER header;
            	char* Data;          // owned payload bytes (may be null)
            	int16_t* SData;      // the same buffer viewed as 16-bit samples
            	size_t dataSize;     // payload length in 16-bit samples
            	int StartPos;        // byte offset recorded for the start of the payload
            };
         
     | 
    	
        CppDataProcess/readme.md
    ADDED
    
    | 
         @@ -0,0 +1,8 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            ## F0Preprocess
         
     | 
| 2 | 
         
            +
            请前往 https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder 下载PyWorld的源代码并编译出静态库并链接到你的项目之中,然后调用此头文件
         
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            ## Slicer
         
     | 
| 5 | 
         
            +
            一个简单的切片机
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            ---
         
     | 
| 8 | 
         
            +
            ~~上面的东西是直接从MoeSS的代码里面抽出来的,可以作为预置预处理的替代品()~~
         
     | 
    	
        cluster/__init__.py
    ADDED
    
    | 
         @@ -0,0 +1,29 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import torch
         
     | 
| 2 | 
         
            +
            from sklearn.cluster import KMeans
         
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            def get_cluster_model(ckpt_path):
                """Rebuild one scikit-learn ``KMeans`` per speaker from a saved checkpoint.

                ckpt_path: path handed to ``torch.load``. The loaded object is iterated as a
                    mapping of speaker -> dict holding the keys ``n_features_in_``,
                    ``_n_threads`` and ``cluster_centers_``.
                return: dict mapping each speaker key to a ``KMeans`` whose stored attributes
                    were restored from the checkpoint.
                """
                # NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
                checkpoint = torch.load(ckpt_path)
                kmeans_dict = {}
                for spk, ckpt in checkpoint.items():
                    centers = ckpt["cluster_centers_"]
                    # n_clusters is the number of stored centres; the feature width is restored
                    # separately through n_features_in_ below.
                    km = KMeans(n_clusters=len(centers))
                    km.__dict__["n_features_in_"] = ckpt["n_features_in_"]
                    km.__dict__["_n_threads"] = ckpt["_n_threads"]
                    km.__dict__["cluster_centers_"] = centers
                    kmeans_dict[spk] = km
                return kmeans_dict
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            def get_cluster_result(model, x, speaker):
                """Return the cluster labels that the model stored under ``speaker`` predicts for ``x``.

                model: mapping from speaker key to an object with a ``predict`` method.
                x: features passed unchanged to ``predict`` (the original docstring gives the
                    shape as np.array [t, 256]).
                """
                speaker_model = model[speaker]
                labels = speaker_model.predict(x)
                return labels
         
     | 
| 22 | 
         
            +
             
     | 
| 23 | 
         
            +
            def get_cluster_center_result(model, x,speaker):
                """Predict with the model stored under ``speaker`` and return its ``cluster_centers_`` indexed by that prediction.

                x: features passed unchanged to ``predict`` (the original docstring gives the
                    shape as np.array [t, 256]).
                """
                nearest = model[speaker].predict(x)
                centers = model[speaker].cluster_centers_
                return centers[nearest]
         
     | 
| 27 | 
         
            +
             
     | 
| 28 | 
         
            +
            def get_center(model, x,speaker):
                """Index the ``cluster_centers_`` of the model stored under ``speaker`` with ``x`` and return the result."""
                centers = model[speaker].cluster_centers_
                return centers[x]
         
     | 
    	
        cluster/__pycache__/__init__.cpython-38.pyc
    ADDED
    
    | 
         Binary file (1.09 kB). View file 
     | 
| 
         | 
    	
        cluster/__pycache__/kmeans.cpython-38.pyc
    ADDED
    
    | 
         Binary file (6.95 kB). View file 
     | 
| 
         | 
    	
        cluster/km_train.py
    ADDED
    
    | 
         @@ -0,0 +1,80 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import time,pdb
         
     | 
| 2 | 
         
            +
            import tqdm
         
     | 
| 3 | 
         
            +
            from time import time as ttime
         
     | 
| 4 | 
         
            +
            import os
         
     | 
| 5 | 
         
            +
            from pathlib import Path
         
     | 
| 6 | 
         
            +
            import logging
         
     | 
| 7 | 
         
            +
            import argparse
         
     | 
| 8 | 
         
            +
            from cluster.kmeans import KMeansGPU
         
     | 
| 9 | 
         
            +
            import torch
         
     | 
| 10 | 
         
            +
            import numpy as np
         
     | 
| 11 | 
         
            +
            from sklearn.cluster import KMeans,MiniBatchKMeans
         
     | 
| 12 | 
         
            +
             
     | 
| 13 | 
         
            +
            logging.basicConfig(level=logging.INFO)
         
     | 
| 14 | 
         
            +
            logger = logging.getLogger(__name__)
         
     | 
| 15 | 
         
            +
            from time import time as ttime
         
     | 
| 16 | 
         
            +
            import pynvml,torch
         
     | 
| 17 | 
         
            +
             
     | 
| 18 | 
         
            +
            def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):
                """Fit a k-means model on every ``*.soft.pt`` feature file found in ``in_dir``.

                Note from the original author: GPU minibatch mode performs poorly, so
                although the library supports it, it is not offered here.

                Parameters
                ----------
                in_dir : pathlib.Path
                    Directory that is globbed for ``*.soft.pt`` files.
                n_clusters : int
                    Number of clusters to fit.
                use_minibatch : bool
                    CPU only: use ``MiniBatchKMeans`` instead of ``KMeans``.
                verbose : bool
                    Forwarded to the clustering implementation.
                use_gpu : bool
                    Use ``KMeansGPU`` instead of scikit-learn.

                Returns
                -------
                dict
                    ``{"n_features_in_", "_n_threads", "cluster_centers_"}``.

                Raises
                ------
                ValueError
                    If ``in_dir`` contains no ``*.soft.pt`` file.
                """
                logger.info(f"Loading features from {in_dir}")
                features = []
                for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
                    # Each file is squeezed on dim 0 and transposed so that files can be
                    # stacked along axis 0 (presumably one row per frame -- TODO confirm).
                    features.append(torch.load(path, map_location="cpu").squeeze(0).numpy().T)

                # Number of feature files loaded (the original counter was never incremented).
                nums = len(features)
                if nums == 0:
                    raise ValueError(f"No *.soft.pt feature files found in {in_dir}")

                features = np.concatenate(features, axis=0)
                print(nums, features.nbytes / 1024**2, "MB , shape:", features.shape, features.dtype)
                features = features.astype(np.float32)
                logger.info(f"Clustering features of shape: {features.shape}")

                t = time.time()
                if use_gpu:
                    kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0, max_iter=500, tol=1e-2)
                    features = torch.from_numpy(features)
                    kmeans.fit_predict(features)
                elif use_minibatch:
                    kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
                else:
                    kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
                print(time.time() - t, "s")

                if use_gpu:
                    x = {
                        # Feature dimensionality is axis 1; axis 0 is the number of samples.
                        "n_features_in_": features.shape[1],
                        "_n_threads": 4,
                        "cluster_centers_": kmeans.centroids.cpu().numpy(),
                    }
                else:
                    x = {
                        "n_features_in_": kmeans.n_features_in_,
                        "_n_threads": kmeans._n_threads,
                        "cluster_centers_": kmeans.cluster_centers_,
                    }
                print("end")

                return x
         
     | 
| 52 | 
         
            +
             
     | 
| 53 | 
         
            +
            if __name__ == "__main__":
                # Script entry point: fit one GPU k-means model per speaker sub-directory
                # of --dataset and store all of them in a single checkpoint under --output.
                cli = argparse.ArgumentParser()
                cli.add_argument(
                    '--dataset',
                    type=Path,
                    default="./dataset/44k",
                    help='path of training data directory',
                )
                cli.add_argument(
                    '--output',
                    type=Path,
                    default="logs/44k",
                    help='path of model output directory',
                )
                args = cli.parse_args()

                n_clusters = 1000
                dataset = args.dataset
                checkpoint_dir = args.output

                # Maps speaker directory name -> dict returned by train_cluster.
                ckpt = {}
                for spk in os.listdir(dataset):
                    in_dir = dataset / spk
                    if not os.path.isdir(in_dir):
                        # Plain files next to the speaker folders are ignored.
                        continue
                    print(f"train kmeans for {spk}...")
                    ckpt[spk] = train_cluster(
                        in_dir,
                        n_clusters,
                        use_minibatch=False,
                        verbose=False,
                        use_gpu=True,
                    )

                checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
                checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
                torch.save(ckpt, checkpoint_path)
         
     | 
| 80 | 
         
            +
                
         
     | 
    	
        cluster/kmeans.py
    ADDED
    
    | 
         @@ -0,0 +1,204 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            from time import time
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            import numpy as np
         
     | 
| 4 | 
         
            +
            import pynvml
         
     | 
| 5 | 
         
            +
            import torch
         
     | 
| 6 | 
         
            +
            from torch.nn.functional import normalize
         
     | 
| 7 | 
         
            +
             
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            # device=torch.device("cuda:0")
         
     | 
| 10 | 
         
            +
            def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
         
     | 
| 11 | 
         
            +
                """ Picks k points in the data based on the kmeans++ method.
         
     | 
| 12 | 
         
            +
             
     | 
| 13 | 
         
            +
                Parameters
         
     | 
| 14 | 
         
            +
                ----------
         
     | 
| 15 | 
         
            +
                data : torch.Tensor
         
     | 
| 16 | 
         
            +
                    Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
         
     | 
| 17 | 
         
            +
                    data, rank 2 multidimensional data, in which case one
         
     | 
| 18 | 
         
            +
                    row is one observation.
         
     | 
| 19 | 
         
            +
                k : int
         
     | 
| 20 | 
         
            +
                    Number of samples to generate.
         
     | 
| 21 | 
         
            +
                sample_size : int
         
     | 
| 22 | 
         
            +
                    sample data to avoid memory overflow during calculation
         
     | 
| 23 | 
         
            +
             
     | 
| 24 | 
         
            +
                Returns
         
     | 
| 25 | 
         
            +
                -------
         
     | 
| 26 | 
         
            +
                init : ndarray
         
     | 
| 27 | 
         
            +
                    A 'k' by 'N' containing the initial centroids.
         
     | 
| 28 | 
         
            +
             
     | 
| 29 | 
         
            +
                References
         
     | 
| 30 | 
         
            +
                ----------
         
     | 
| 31 | 
         
            +
                .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
         
     | 
| 32 | 
         
            +
                   careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
         
     | 
| 33 | 
         
            +
                   on Discrete Algorithms, 2007.
         
     | 
| 34 | 
         
            +
                .. [2] scipy/cluster/vq.py: _kpp
         
     | 
| 35 | 
         
            +
                """
         
     | 
| 36 | 
         
            +
                batch_size=data.shape[0]
         
     | 
| 37 | 
         
            +
                if batch_size>sample_size:
         
     | 
| 38 | 
         
            +
                    data = data[torch.randint(0, batch_size,[sample_size], device=data.device)]
         
     | 
| 39 | 
         
            +
                dims = data.shape[1] if len(data.shape) > 1 else 1
         
     | 
| 40 | 
         
            +
                init = torch.zeros((k, dims)).to(data.device)
         
     | 
| 41 | 
         
            +
                r = torch.distributions.uniform.Uniform(0, 1)
         
     | 
| 42 | 
         
            +
                for i in range(k):
         
     | 
| 43 | 
         
            +
                    if i == 0:
         
     | 
| 44 | 
         
            +
                        init[i, :] = data[torch.randint(data.shape[0], [1])]
         
     | 
| 45 | 
         
            +
                    else:
         
     | 
| 46 | 
         
            +
                        D2 = torch.cdist(init[:i, :][None, :], data[None, :], p=2)[0].amin(dim=0)
         
     | 
| 47 | 
         
            +
                        probs = D2 / torch.sum(D2)
         
     | 
| 48 | 
         
            +
                        cumprobs = torch.cumsum(probs, dim=0)
         
     | 
| 49 | 
         
            +
                        init[i, :] = data[torch.searchsorted(cumprobs, r.sample([1]).to(data.device))]
         
     | 
| 50 | 
         
            +
                return init
         
     | 
| 51 | 
         
            +
            class KMeansGPU:
              '''
              Kmeans clustering algorithm implemented with PyTorch

              Parameters:
                n_clusters: int,
                  Number of clusters

                max_iter: int, default: 200
                  Maximum number of iterations

                tol: float, default: 0.0001
                  Tolerance

                verbose: int, default: 0
                  Verbosity

                mode: {'euclidean', 'cosine'}, default: 'euclidean'
                  Type of distance measure

                device: torch.device or str, default: cuda:0
                  CUDA device used for clustering; its free memory (queried through
                  NVML) determines the minibatch size

              Attributes:
                centroids: torch.Tensor, shape: [n_clusters, n_features]
                  cluster centroids (set by fit_predict)

                minibatch: int
                  number of samples processed per iteration, derived from the free
                  GPU memory at construction time
              '''
              def __init__(self, n_clusters, max_iter=200, tol=1e-4, verbose=0, mode="euclidean",device=torch.device("cuda:0")):
                self.n_clusters = n_clusters
                self.max_iter = max_iter
                self.tol = tol
                self.verbose = verbose
                self.mode = mode
                # Accept "cuda:0"-style strings as well as torch.device objects.
                device = torch.device(device)
                self.device = device
                # torch.device("cuda") carries no index; it denotes the current device.
                gpu_index = device.index if device.index is not None else torch.cuda.current_device()
                pynvml.nvmlInit()
                try:
                  gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
                  info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
                  free_bytes = info.free
                finally:
                  # Balance nvmlInit so the NVML reference is not leaked.
                  pynvml.nvmlShutdown()
                # Heuristic: 33e6 / n_clusters samples per GB of free GPU memory.
                self.minibatch = int(33e6/self.n_clusters*free_bytes/ 1024 / 1024 / 1024)
                print("free_mem/GB:",free_bytes/ 1024 / 1024 / 1024,"minibatch:",self.minibatch)

              @staticmethod
              def cos_sim(a, b):
                """
                  Compute cosine similarity of 2 sets of vectors

                  Parameters:
                  a: torch.Tensor, shape: [m, n_features]

                  b: torch.Tensor, shape: [n, n_features]
                """
                return normalize(a, dim=-1) @ normalize(b, dim=-1).transpose(-2, -1)

              @staticmethod
              def euc_sim(a, b):
                """
                  Compute euclidean similarity of 2 sets of vectors, i.e. the
                  negated squared euclidean distance (larger means closer)

                  Parameters:
                  a: torch.Tensor, shape: [m, n_features]
                  b: torch.Tensor, shape: [n, n_features]
                """
                return 2 * a @ b.transpose(-2, -1) -(a**2).sum(dim=1)[..., :, None] - (b**2).sum(dim=1)[..., None, :]

              def max_sim(self, a, b):
                """
                  Compute maximum similarity (or minimum distance) of each vector
                  in a with all of the vectors in b

                  Parameters:
                  a: torch.Tensor, shape: [m, n_features]
                  b: torch.Tensor, shape: [n, n_features]

                  Return:
                  (values, indices) of the best match in b for every row of a

                  Raises:
                  ValueError if self.mode is neither 'cosine' nor 'euclidean'
                """
                if self.mode == 'cosine':
                  sim_func = self.cos_sim
                elif self.mode == 'euclidean':
                  sim_func = self.euc_sim
                else:
                  raise ValueError(f"unsupported mode {self.mode!r}; expected 'cosine' or 'euclidean'")
                sim = sim_func(a, b)
                max_sim_v, max_sim_i = sim.max(dim=-1)
                return max_sim_v, max_sim_i

              def fit_predict(self, X):
                """
                  Fit the centroids on X and return the labels of the last
                  processed batch.

                  Parameters:
                  X: torch.Tensor, shape: [n_samples, n_features]

                  Return:
                  labels: torch.Tensor of cluster ids for the samples used in the
                    final iteration. When self.minibatch < n_samples this is a random
                    minibatch of X, not X itself. None if max_iter is 0.

                  Sizing used below (offset is computed from n_clusters in the code):
                    full-data cache : when self.minibatch >= n_samples, X is moved to the device
                    k-means++ pool  : at most int(self.minibatch * 10 / offset) rows
                    k-means++ sample: min(int(self.minibatch / 12 / offset), n_samples) rows
                """
                assert isinstance(X, torch.Tensor), "input must be torch.Tensor"
                assert X.dtype in [torch.half, torch.float, torch.double], "input must be floating point"
                assert X.ndim == 2, "input must be a 2d tensor with shape: [n_samples, n_features] "

                offset = np.power(1.5,np.log(self.n_clusters / 1000))/np.log(2)
                # int16 labels keep the [n_clusters, batch] comparison small, but they
                # only hold ids up to 32767; switch to int32 for larger cluster counts.
                if self.n_clusters - 1 <= torch.iinfo(torch.int16).max:
                  label_dtype = torch.int16
                else:
                  label_dtype = torch.int32
                n_iter = 0
                with torch.no_grad():
                  batch_size= X.shape[0]
                  start_time = time()
                  # Seed the centroids with k-means++ on (a random subset of) the data.
                  if (self.minibatch*10//offset< batch_size):
                    x = X[torch.randint(0, batch_size,[int(self.minibatch*10/offset)])].to(self.device)
                  else:
                    x = X.to(self.device)
                  self.centroids = _kpp(x, self.n_clusters, min(int(self.minibatch/12/offset),batch_size))
                  del x
                  torch.cuda.empty_cache()
                  # Running per-cluster sample counts, initialised to all ones.
                  num_points_in_clusters = torch.ones(self.n_clusters, device=self.device, dtype=X.dtype)
                  closest = None
                  if(self.minibatch>=batch_size//2 and self.minibatch<batch_size):
                    X = X[torch.randint(0, batch_size,[self.minibatch])].to(self.device)
                  elif(self.minibatch>=batch_size):
                    X=X.to(self.device)
                  for i in range(self.max_iter):
                    n_iter = i + 1
                    iter_time = time()
                    if self.minibatch<batch_size//2:
                      # The usable minibatch is too small to cache the data, so a fresh
                      # batch is copied from host memory to GPU memory every iteration.
                      x = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
                    else:
                      # Otherwise the data was cached on the device up front.
                      x = X

                    closest = self.max_sim(a=x, b=self.centroids)[1].to(label_dtype)
                    matched_clusters, counts = closest.unique(return_counts=True)
                    expanded_closest = closest[None].expand(self.n_clusters, -1)
                    # One-hot membership mask [n_clusters, batch]; arange is int64.
                    mask = (expanded_closest==torch.arange(self.n_clusters, device=self.device)[:, None]).to(X.dtype)
                    c_grad = mask @ x / mask.sum(-1)[..., :, None]
                    c_grad[c_grad!=c_grad] = 0 # remove NaNs
                    error = (c_grad - self.centroids).pow(2).sum()
                    if self.minibatch is not None:
                      lr = 1/num_points_in_clusters[:,None] * 0.9 + 0.1
                    else:
                      lr = 1
                    # Tensors used as indices must be long, byte or bool.
                    matched_clusters=matched_clusters.long()
                    num_points_in_clusters[matched_clusters] += counts
                    self.centroids = self.centroids * (1-lr) + c_grad * lr
                    if self.verbose >= 2:
                      print('iter:', i, 'error:', error.item(), 'time spent:', round(time()-iter_time, 4))
                    if error <= self.tol:
                      break

                  if self.verbose >= 1:
                    print(f'used {n_iter} iterations ({round(time()-start_time, 4)}s) to cluster {batch_size} items into {self.n_clusters} clusters')
                return closest
         
     | 
    	
        cluster/train_cluster.py
    ADDED
    
    | 
         @@ -0,0 +1,85 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import argparse
         
     | 
| 2 | 
         
            +
            import logging
         
     | 
| 3 | 
         
            +
            import os
         
     | 
| 4 | 
         
            +
            import time
         
     | 
| 5 | 
         
            +
            from pathlib import Path
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            import numpy as np
         
     | 
| 8 | 
         
            +
            import torch
         
     | 
| 9 | 
         
            +
            import tqdm
         
     | 
| 10 | 
         
            +
            from kmeans import KMeansGPU
         
     | 
| 11 | 
         
            +
            from sklearn.cluster import KMeans, MiniBatchKMeans
         
     | 
| 12 | 
         
            +
             
     | 
| 13 | 
         
            +
            # Configure the root logger so that INFO-level (and higher) messages are emitted.
            logging.basicConfig(level=logging.INFO)
         
     | 
| 14 | 
         
            +
            # Module-level logger; train_cluster() reports its progress through it.
            logger = logging.getLogger(__name__)
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):
                """Fit a k-means model on every ``*.soft.pt`` feature file found in ``in_dir``.

                Each file is loaded with ``torch.load`` onto the CPU, squeezed along dim 0,
                converted to NumPy and transposed; the per-file arrays are then concatenated
                along axis 0 and cast to ``float32`` before clustering.

                Args:
                    in_dir: ``pathlib.Path`` of the directory that is globbed for ``*.soft.pt``.
                    n_clusters: Number of centroids to fit.
                    use_minibatch: CPU path only. When true, use ``MiniBatchKMeans``
                        (``batch_size=4096``, ``max_iter=80``); otherwise use ``KMeans``.
                        GPU mini-batch k-means is deliberately not offered: it was judged
                        too poor to be worth supporting even though the library has it.
                    verbose: Forwarded to the scikit-learn estimators; on the GPU path it is
                        mapped to ``KMeansGPU`` verbosity 2 (true) or 0 (false).
                    use_gpu: When truthy, cluster with ``KMeansGPU`` (euclidean mode,
                        ``max_iter=500``, ``tol=1e-2``) instead of scikit-learn.

                Returns:
                    dict with the keys ``"n_features_in_"``, ``"_n_threads"`` and
                    ``"cluster_centers_"``. On the GPU path ``_n_threads`` is the fixed
                    value 4 and the centers are ``kmeans.centroids`` moved to the CPU as
                    a NumPy array.

                Raises:
                    ValueError: If ``in_dir`` contains no ``*.soft.pt`` files.
                """
                if str(in_dir).endswith(".ipynb_checkpoints"):
                    # NOTE(review): this only logs -- processing continues below. A checkpoint
                    # directory without feature files ends in the ValueError raised further
                    # down; callers that want to skip such directories must do so themselves.
                    logger.info(f"Ignore {in_dir}")

                logger.info(f"Loading features from {in_dir}")
                per_file = []
                for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
                    # SECURITY NOTE: torch.load unpickles the file; only point this at
                    # feature files you produced yourself.
                    per_file.append(torch.load(path, map_location="cpu").squeeze(0).numpy().T)

                # Number of feature files loaded (previously always reported as 0).
                nums = len(per_file)
                if nums == 0:
                    # np.concatenate would raise an opaque ValueError on an empty list;
                    # raise the same exception type with an actionable message instead.
                    raise ValueError(f"No '*.soft.pt' feature files found in {in_dir}")

                features = np.concatenate(per_file, axis=0)
                print(nums, features.nbytes/ 1024**2, "MB , shape:",features.shape, features.dtype)
                features = features.astype(np.float32)
                logger.info(f"Clustering features of shape: {features.shape}")

                t = time.time()
                if use_gpu:
                    kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0, max_iter=500, tol=1e-2)
                    features = torch.from_numpy(features)
                    kmeans.fit_predict(features)
                elif use_minibatch:
                    kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
                else:
                    kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
                print(time.time()-t, "s")

                if use_gpu:
                    x = {
                        "n_features_in_": features.shape[1],
                        "_n_threads": 4,
                        "cluster_centers_": kmeans.centroids.cpu().numpy(),
                    }
                else:
                    x = {
                        "n_features_in_": kmeans.n_features_in_,
                        "_n_threads": kmeans._n_threads,
                        "cluster_centers_": kmeans.cluster_centers_,
                    }
                print("end")

                return x
         
     | 
| 53 | 
         
            +
             
     | 
| 54 | 
         
            +
            if __name__ == "__main__":
                # Command-line entry point: train one k-means model per speaker directory
                # under --dataset and save all of them in a single checkpoint file.
                parser = argparse.ArgumentParser()
                parser.add_argument('--dataset', type=Path, default="./dataset/44k",
                                    help='path of training data directory')
                parser.add_argument('--output', type=Path, default="logs/44k",
                                    help='path of model output directory')
                parser.add_argument('--gpu',action='store_true', default=False ,
                                    help='to use GPU')

                args = parser.parse_args()

                checkpoint_dir = args.output
                dataset = args.dataset
                use_gpu = args.gpu
                n_clusters = 10000

                # Maps speaker directory name -> dict returned by train_cluster().
                ckpt = {}
                for spk in os.listdir(dataset):
                    in_dir = dataset/spk
                    if not os.path.isdir(in_dir):
                        continue
                    if spk == ".ipynb_checkpoints":
                        # Jupyter's checkpoint directory is not a speaker; train_cluster()
                        # only logs for it and then carries on, so skip it here.
                        print(f"skip {spk}")
                        continue
                    print(f"train kmeans for {spk}...")
                    ckpt[spk] = train_cluster(in_dir, n_clusters, use_minibatch=False, verbose=False, use_gpu=use_gpu)

                checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
                checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
                torch.save(
                    ckpt,
                    checkpoint_path,
                )
         
     | 
| 85 | 
         
            +
                
         
     | 
    	
        configs/config.json
    ADDED
    
    | 
         @@ -0,0 +1,94 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            {
         
     | 
| 2 | 
         
            +
                "train": {
         
     | 
| 3 | 
         
            +
                    "log_interval": 20,
         
     | 
| 4 | 
         
            +
                    "eval_interval": 20,
         
     | 
| 5 | 
         
            +
                    "seed": 1234,
         
     | 
| 6 | 
         
            +
                    "epochs": 10000,
         
     | 
| 7 | 
         
            +
                    "learning_rate": 0.0001,
         
     | 
| 8 | 
         
            +
                    "betas": [
         
     | 
| 9 | 
         
            +
                        0.8,
         
     | 
| 10 | 
         
            +
                        0.99
         
     | 
| 11 | 
         
            +
                    ],
         
     | 
| 12 | 
         
            +
                    "eps": 1e-09,
         
     | 
| 13 | 
         
            +
                    "batch_size": 6,
         
     | 
| 14 | 
         
            +
                    "fp16_run": false,
         
     | 
| 15 | 
         
            +
                    "lr_decay": 0.999875,
         
     | 
| 16 | 
         
            +
                    "segment_size": 10240,
         
     | 
| 17 | 
         
            +
                    "init_lr_ratio": 1,
         
     | 
| 18 | 
         
            +
                    "warmup_epochs": 0,
         
     | 
| 19 | 
         
            +
                    "c_mel": 45,
         
     | 
| 20 | 
         
            +
                    "c_kl": 1.0,
         
     | 
| 21 | 
         
            +
                    "use_sr": true,
         
     | 
| 22 | 
         
            +
                    "max_speclen": 512,
         
     | 
| 23 | 
         
            +
                    "port": "8001",
         
     | 
| 24 | 
         
            +
                    "keep_ckpts": 3
         
     | 
| 25 | 
         
            +
                },
         
     | 
| 26 | 
         
            +
                "data": {
         
     | 
| 27 | 
         
            +
                    "training_files": "filelists/train.txt",
         
     | 
| 28 | 
         
            +
                    "validation_files": "filelists/val.txt",
         
     | 
| 29 | 
         
            +
                    "max_wav_value": 32768.0,
         
     | 
| 30 | 
         
            +
                    "sampling_rate": 44100,
         
     | 
| 31 | 
         
            +
                    "filter_length": 2048,
         
     | 
| 32 | 
         
            +
                    "hop_length": 512,
         
     | 
| 33 | 
         
            +
                    "win_length": 2048,
         
     | 
| 34 | 
         
            +
                    "n_mel_channels": 80,
         
     | 
| 35 | 
         
            +
                    "mel_fmin": 0.0,
         
     | 
| 36 | 
         
            +
                    "mel_fmax": 22050
         
     | 
| 37 | 
         
            +
                },
         
     | 
| 38 | 
         
            +
                "model": {
         
     | 
| 39 | 
         
            +
                    "inter_channels": 192,
         
     | 
| 40 | 
         
            +
                    "hidden_channels": 192,
         
     | 
| 41 | 
         
            +
                    "filter_channels": 768,
         
     | 
| 42 | 
         
            +
                    "n_heads": 2,
         
     | 
| 43 | 
         
            +
                    "n_layers": 6,
         
     | 
| 44 | 
         
            +
                    "kernel_size": 3,
         
     | 
| 45 | 
         
            +
                    "p_dropout": 0.1,
         
     | 
| 46 | 
         
            +
                    "resblock": "1",
         
     | 
| 47 | 
         
            +
                    "resblock_kernel_sizes": [
         
     | 
| 48 | 
         
            +
                        3,
         
     | 
| 49 | 
         
            +
                        7,
         
     | 
| 50 | 
         
            +
                        11
         
     | 
| 51 | 
         
            +
                    ],
         
     | 
| 52 | 
         
            +
                    "resblock_dilation_sizes": [
         
     | 
| 53 | 
         
            +
                        [
         
     | 
| 54 | 
         
            +
                            1,
         
     | 
| 55 | 
         
            +
                            3,
         
     | 
| 56 | 
         
            +
                            5
         
     | 
| 57 | 
         
            +
                        ],
         
     | 
| 58 | 
         
            +
                        [
         
     | 
| 59 | 
         
            +
                            1,
         
     | 
| 60 | 
         
            +
                            3,
         
     | 
| 61 | 
         
            +
                            5
         
     | 
| 62 | 
         
            +
                        ],
         
     | 
| 63 | 
         
            +
                        [
         
     | 
| 64 | 
         
            +
                            1,
         
     | 
| 65 | 
         
            +
                            3,
         
     | 
| 66 | 
         
            +
                            5
         
     | 
| 67 | 
         
            +
                        ]
         
     | 
| 68 | 
         
            +
                    ],
         
     | 
| 69 | 
         
            +
                    "upsample_rates": [
         
     | 
| 70 | 
         
            +
                        8,
         
     | 
| 71 | 
         
            +
                        8,
         
     | 
| 72 | 
         
            +
                        2,
         
     | 
| 73 | 
         
            +
                        2,
         
     | 
| 74 | 
         
            +
                        2
         
     | 
| 75 | 
         
            +
                    ],
         
     | 
| 76 | 
         
            +
                    "upsample_initial_channel": 512,
         
     | 
| 77 | 
         
            +
                    "upsample_kernel_sizes": [
         
     | 
| 78 | 
         
            +
                        16,
         
     | 
| 79 | 
         
            +
                        16,
         
     | 
| 80 | 
         
            +
                        4,
         
     | 
| 81 | 
         
            +
                        4,
         
     | 
| 82 | 
         
            +
                        4
         
     | 
| 83 | 
         
            +
                    ],
         
     | 
| 84 | 
         
            +
                    "n_layers_q": 3,
         
     | 
| 85 | 
         
            +
                    "use_spectral_norm": false,
         
     | 
| 86 | 
         
            +
                    "gin_channels": 256,
         
     | 
| 87 | 
         
            +
                    "ssl_dim": 256,
         
     | 
| 88 | 
         
            +
                    "n_speakers": 200,
         
     | 
| 89 | 
         
            +
                    "speech_encoder": "vec256l9"
         
     | 
| 90 | 
         
            +
                },
         
     | 
| 91 | 
         
            +
                "spk": {
         
     | 
| 92 | 
         
            +
                    "Shengshuyan": 0
         
     | 
| 93 | 
         
            +
                }
         
     | 
| 94 | 
         
            +
            }
         
     | 
    	
        configs/diffusion.yaml
    ADDED
    
    | 
         @@ -0,0 +1,48 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            data:
         
     | 
| 2 | 
         
            +
              sampling_rate: 44100
         
     | 
| 3 | 
         
            +
              block_size: 512 # Equal to hop_length
         
     | 
| 4 | 
         
            +
              duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip
         
     | 
| 5 | 
         
            +
              encoder: 'vec768l12' # 'hubertsoft', 'vec256l9', 'vec768l12'
         
     | 
| 6 | 
         
            +
              cnhubertsoft_gate: 10
         
     | 
| 7 | 
         
            +
              encoder_sample_rate: 16000
         
     | 
| 8 | 
         
            +
              encoder_hop_size: 320
         
     | 
| 9 | 
         
            +
              encoder_out_channels: 768 # 256 if using 'hubertsoft'
         
     | 
| 10 | 
         
            +
              training_files: "filelists/train.txt"
         
     | 
| 11 | 
         
            +
              validation_files: "filelists/val.txt"
         
     | 
| 12 | 
         
            +
              extensions: # List of file extensions included in the data collection
         
     | 
| 13 | 
         
            +
                - wav
         
     | 
| 14 | 
         
            +
            model:
         
     | 
| 15 | 
         
            +
              type: 'Diffusion'
         
     | 
| 16 | 
         
            +
              n_layers: 20
         
     | 
| 17 | 
         
            +
              n_chans: 512
         
     | 
| 18 | 
         
            +
              n_hidden: 256
         
     | 
| 19 | 
         
            +
              use_pitch_aug: true  
         
     | 
| 20 | 
         
            +
              n_spk: 1 # max number of different speakers
         
     | 
| 21 | 
         
            +
            device: cuda
         
     | 
| 22 | 
         
            +
            vocoder:
         
     | 
| 23 | 
         
            +
              type: 'nsf-hifigan'
         
     | 
| 24 | 
         
            +
              ckpt: 'pretrain/nsf_hifigan/model'
         
     | 
| 25 | 
         
            +
            infer:
         
     | 
| 26 | 
         
            +
              speedup: 10
         
     | 
| 27 | 
         
            +
              method: 'dpm-solver' # 'pndm' or 'dpm-solver'
         
     | 
| 28 | 
         
            +
            env:
         
     | 
| 29 | 
         
            +
              expdir: logs/44k/diffusion
         
     | 
| 30 | 
         
            +
              gpu_id: 0
         
     | 
| 31 | 
         
            +
            train:
         
     | 
| 32 | 
         
            +
              num_workers: 2 # If your CPU and GPU are both very powerful, setting this to 0 may be faster
         
     | 
| 33 | 
         
            +
              amp_dtype: fp32 # fp32, fp16 or bf16 (fp16 or bf16 may be faster if it is supported by your gpu)
         
     | 
| 34 | 
         
            +
              batch_size: 48
         
     | 
| 35 | 
         
            +
              cache_all_data: true # Setting this to false saves system memory (or GPU memory), but may be slower
         
     | 
| 36 | 
         
            +
              cache_device: 'cpu' # Set to 'cuda' to cache the data in GPU memory; fastest option on a powerful GPU
         
     | 
| 37 | 
         
            +
              cache_fp16: true
         
     | 
| 38 | 
         
            +
              epochs: 100000
         
     | 
| 39 | 
         
            +
              interval_log: 10
         
     | 
| 40 | 
         
            +
              interval_val: 2000
         
     | 
| 41 | 
         
            +
              interval_force_save: 10000
         
     | 
| 42 | 
         
            +
              lr: 0.0002
         
     | 
| 43 | 
         
            +
              decay_step: 100000
         
     | 
| 44 | 
         
            +
              gamma: 0.5
         
     | 
| 45 | 
         
            +
              weight_decay: 0
         
     | 
| 46 | 
         
            +
              save_opt: false
         
     | 
| 47 | 
         
            +
            spk:
         
     | 
| 48 | 
         
            +
              'nyaru': 0
         
     | 
    	
        configs_template/config_template.json
    ADDED
    
    | 
         @@ -0,0 +1,77 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            {
         
     | 
| 2 | 
         
            +
              "train": {
         
     | 
| 3 | 
         
            +
                "log_interval": 200,
         
     | 
| 4 | 
         
            +
                "eval_interval": 800,
         
     | 
| 5 | 
         
            +
                "seed": 1234,
         
     | 
| 6 | 
         
            +
                "epochs": 10000,
         
     | 
| 7 | 
         
            +
                "learning_rate": 0.0001,
         
     | 
| 8 | 
         
            +
                "betas": [
         
     | 
| 9 | 
         
            +
                  0.8,
         
     | 
| 10 | 
         
            +
                  0.99
         
     | 
| 11 | 
         
            +
                ],
         
     | 
| 12 | 
         
            +
                "eps": 1e-09,
         
     | 
| 13 | 
         
            +
                "batch_size": 6,
         
     | 
| 14 | 
         
            +
                "fp16_run": false,
         
     | 
| 15 | 
         
            +
                "half_type": "fp16",
         
     | 
| 16 | 
         
            +
                "lr_decay": 0.999875,
         
     | 
| 17 | 
         
            +
                "segment_size": 10240,
         
     | 
| 18 | 
         
            +
                "init_lr_ratio": 1,
         
     | 
| 19 | 
         
            +
                "warmup_epochs": 0,
         
     | 
| 20 | 
         
            +
                "c_mel": 45,
         
     | 
| 21 | 
         
            +
                "c_kl": 1.0,
         
     | 
| 22 | 
         
            +
                "use_sr": true,
         
     | 
| 23 | 
         
            +
                "max_speclen": 512,
         
     | 
| 24 | 
         
            +
                "port": "8001",
         
     | 
| 25 | 
         
            +
                "keep_ckpts": 3,
         
     | 
| 26 | 
         
            +
                "all_in_mem": false,
         
     | 
| 27 | 
         
            +
                "vol_aug":false
         
     | 
| 28 | 
         
            +
              },
         
     | 
| 29 | 
         
            +
              "data": {
         
     | 
| 30 | 
         
            +
                "training_files": "filelists/train.txt",
         
     | 
| 31 | 
         
            +
                "validation_files": "filelists/val.txt",
         
     | 
| 32 | 
         
            +
                "max_wav_value": 32768.0,
         
     | 
| 33 | 
         
            +
                "sampling_rate": 44100,
         
     | 
| 34 | 
         
            +
                "filter_length": 2048,
         
     | 
| 35 | 
         
            +
                "hop_length": 512,
         
     | 
| 36 | 
         
            +
                "win_length": 2048,
         
     | 
| 37 | 
         
            +
                "n_mel_channels": 80,
         
     | 
| 38 | 
         
            +
                "mel_fmin": 0.0,
         
     | 
| 39 | 
         
            +
                "mel_fmax": 22050,
         
     | 
| 40 | 
         
            +
                "unit_interpolate_mode":"nearest"
         
     | 
| 41 | 
         
            +
              },
         
     | 
| 42 | 
         
            +
              "model": {
         
     | 
| 43 | 
         
            +
                "inter_channels": 192,
         
     | 
| 44 | 
         
            +
                "hidden_channels": 192,
         
     | 
| 45 | 
         
            +
                "filter_channels": 768,
         
     | 
| 46 | 
         
            +
                "n_heads": 2,
         
     | 
| 47 | 
         
            +
                "n_layers": 6,
         
     | 
| 48 | 
         
            +
                "kernel_size": 3,
         
     | 
| 49 | 
         
            +
                "p_dropout": 0.1,
         
     | 
| 50 | 
         
            +
                "resblock": "1",
         
     | 
| 51 | 
         
            +
                "resblock_kernel_sizes": [3,7,11],
         
     | 
| 52 | 
         
            +
                "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
         
     | 
| 53 | 
         
            +
                "upsample_rates": [ 8, 8, 2, 2, 2],
         
     | 
| 54 | 
         
            +
                "upsample_initial_channel": 512,
         
     | 
| 55 | 
         
            +
                "upsample_kernel_sizes": [16,16, 4, 4, 4],
         
     | 
| 56 | 
         
            +
                "n_layers_q": 3,
         
     | 
| 57 | 
         
            +
                "n_flow_layer": 4,
         
     | 
| 58 | 
         
            +
                "use_spectral_norm": false,
         
     | 
| 59 | 
         
            +
                "gin_channels": 768,
         
     | 
| 60 | 
         
            +
                "ssl_dim": 768,
         
     | 
| 61 | 
         
            +
                "n_speakers": 200,
         
     | 
| 62 | 
         
            +
                "vocoder_name":"nsf-hifigan",
         
     | 
| 63 | 
         
            +
                "speech_encoder":"vec768l12",
         
     | 
| 64 | 
         
            +
                "speaker_embedding":false,
         
     | 
| 65 | 
         
            +
                "vol_embedding":false,
         
     | 
| 66 | 
         
            +
                "use_depthwise_conv":false,
         
     | 
| 67 | 
         
            +
                "flow_share_parameter": false,
         
     | 
| 68 | 
         
            +
                "use_automatic_f0_prediction": true
         
     | 
| 69 | 
         
            +
              },
         
     | 
| 70 | 
         
            +
              "spk": {
         
     | 
| 71 | 
         
            +
                "nyaru": 0,
         
     | 
| 72 | 
         
            +
                "huiyu": 1,
         
     | 
| 73 | 
         
            +
                "nen": 2,
         
     | 
| 74 | 
         
            +
                "paimon": 3,
         
     | 
| 75 | 
         
            +
                "yunhao": 4
         
     | 
| 76 | 
         
            +
              }
         
     | 
| 77 | 
         
            +
            }
         
     | 
    	
        configs_template/config_tiny_template.json
    ADDED
    
    | 
         @@ -0,0 +1,77 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            {
         
     | 
| 2 | 
         
            +
              "train": {
         
     | 
| 3 | 
         
            +
                "log_interval": 200,
         
     | 
| 4 | 
         
            +
                "eval_interval": 800,
         
     | 
| 5 | 
         
            +
                "seed": 1234,
         
     | 
| 6 | 
         
            +
                "epochs": 10000,
         
     | 
| 7 | 
         
            +
                "learning_rate": 0.0001,
         
     | 
| 8 | 
         
            +
                "betas": [
         
     | 
| 9 | 
         
            +
                  0.8,
         
     | 
| 10 | 
         
            +
                  0.99
         
     | 
| 11 | 
         
            +
                ],
         
     | 
| 12 | 
         
            +
                "eps": 1e-09,
         
     | 
| 13 | 
         
            +
                "batch_size": 6,
         
     | 
| 14 | 
         
            +
                "fp16_run": false,
         
     | 
| 15 | 
         
            +
                "half_type": "fp16",
         
     | 
| 16 | 
         
            +
                "lr_decay": 0.999875,
         
     | 
| 17 | 
         
            +
                "segment_size": 10240,
         
     | 
| 18 | 
         
            +
                "init_lr_ratio": 1,
         
     | 
| 19 | 
         
            +
                "warmup_epochs": 0,
         
     | 
| 20 | 
         
            +
                "c_mel": 45,
         
     | 
| 21 | 
         
            +
                "c_kl": 1.0,
         
     | 
| 22 | 
         
            +
                "use_sr": true,
         
     | 
| 23 | 
         
            +
                "max_speclen": 512,
         
     | 
| 24 | 
         
            +
                "port": "8001",
         
     | 
| 25 | 
         
            +
                "keep_ckpts": 3,
         
     | 
| 26 | 
         
            +
                "all_in_mem": false,
         
     | 
| 27 | 
         
            +
                "vol_aug":false
         
     | 
| 28 | 
         
            +
              },
         
     | 
| 29 | 
         
            +
              "data": {
         
     | 
| 30 | 
         
            +
                "training_files": "filelists/train.txt",
         
     | 
| 31 | 
         
            +
                "validation_files": "filelists/val.txt",
         
     | 
| 32 | 
         
            +
                "max_wav_value": 32768.0,
         
     | 
| 33 | 
         
            +
                "sampling_rate": 44100,
         
     | 
| 34 | 
         
            +
                "filter_length": 2048,
         
     | 
| 35 | 
         
            +
                "hop_length": 512,
         
     | 
| 36 | 
         
            +
                "win_length": 2048,
         
     | 
| 37 | 
         
            +
                "n_mel_channels": 80,
         
     | 
| 38 | 
         
            +
                "mel_fmin": 0.0,
         
     | 
| 39 | 
         
            +
                "mel_fmax": 22050,
         
     | 
| 40 | 
         
            +
                "unit_interpolate_mode":"nearest"
         
     | 
| 41 | 
         
            +
              },
         
     | 
| 42 | 
         
            +
              "model": {
         
     | 
| 43 | 
         
            +
                "inter_channels": 192,
         
     | 
| 44 | 
         
            +
                "hidden_channels": 192,
         
     | 
| 45 | 
         
            +
                "filter_channels": 512,
         
     | 
| 46 | 
         
            +
                "n_heads": 2,
         
     | 
| 47 | 
         
            +
                "n_layers": 6,
         
     | 
| 48 | 
         
            +
                "kernel_size": 3,
         
     | 
| 49 | 
         
            +
                "p_dropout": 0.1,
         
     | 
| 50 | 
         
            +
                "resblock": "1",
         
     | 
| 51 | 
         
            +
                "resblock_kernel_sizes": [3,7,11],
         
     | 
| 52 | 
         
            +
                "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
         
     | 
| 53 | 
         
            +
                "upsample_rates": [ 8, 8, 2, 2, 2],
         
     | 
| 54 | 
         
            +
                "upsample_initial_channel": 400,
         
     | 
| 55 | 
         
            +
                "upsample_kernel_sizes": [16,16, 4, 4, 4],
         
     | 
| 56 | 
         
            +
                "n_layers_q": 3,
         
     | 
| 57 | 
         
            +
                "n_flow_layer": 4,
         
     | 
| 58 | 
         
            +
                "use_spectral_norm": false,
         
     | 
| 59 | 
         
            +
                "gin_channels": 768,
         
     | 
| 60 | 
         
            +
                "ssl_dim": 768,
         
     | 
| 61 | 
         
            +
                "n_speakers": 200,
         
     | 
| 62 | 
         
            +
                "vocoder_name":"nsf-hifigan",
         
     | 
| 63 | 
         
            +
                "speech_encoder":"vec768l12",
         
     | 
| 64 | 
         
            +
                "speaker_embedding":false,
         
     | 
| 65 | 
         
            +
                "vol_embedding":false,
         
     | 
| 66 | 
         
            +
                "use_depthwise_conv":true,
         
     | 
| 67 | 
         
            +
                "flow_share_parameter": true,
         
     | 
| 68 | 
         
            +
                "use_automatic_f0_prediction": true
         
     | 
| 69 | 
         
            +
              },
         
     | 
| 70 | 
         
            +
              "spk": {
         
     | 
| 71 | 
         
            +
                "nyaru": 0,
         
     | 
| 72 | 
         
            +
                "huiyu": 1,
         
     | 
| 73 | 
         
            +
                "nen": 2,
         
     | 
| 74 | 
         
            +
                "paimon": 3,
         
     | 
| 75 | 
         
            +
                "yunhao": 4
         
     | 
| 76 | 
         
            +
              }
         
     | 
| 77 | 
         
            +
            }
         
     | 
    	
        configs_template/diffusion_template.yaml
    ADDED
    
    | 
         @@ -0,0 +1,51 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            data:
         
     | 
| 2 | 
         
            +
              sampling_rate: 44100
         
     | 
| 3 | 
         
            +
              block_size: 512 # Equal to hop_length
         
     | 
| 4 | 
         
            +
              duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip
         
     | 
| 5 | 
         
            +
              encoder: 'vec768l12' # 'hubertsoft', 'vec256l9', 'vec768l12'
         
     | 
| 6 | 
         
            +
              cnhubertsoft_gate: 10
         
     | 
| 7 | 
         
            +
              encoder_sample_rate: 16000
         
     | 
| 8 | 
         
            +
              encoder_hop_size: 320
         
     | 
| 9 | 
         
            +
              encoder_out_channels: 768 # 256 if using 'hubertsoft'
         
     | 
| 10 | 
         
            +
              training_files: "filelists/train.txt"
         
     | 
| 11 | 
         
            +
              validation_files: "filelists/val.txt"
         
     | 
| 12 | 
         
            +
              extensions: # List of file extensions included in the data collection
         
     | 
| 13 | 
         
            +
                - wav
         
     | 
| 14 | 
         
            +
              unit_interpolate_mode: "nearest"
         
     | 
| 15 | 
         
            +
            model:
         
     | 
| 16 | 
         
            +
              type: 'Diffusion'
         
     | 
| 17 | 
         
            +
              n_layers: 20
         
     | 
| 18 | 
         
            +
              n_chans: 512
         
     | 
| 19 | 
         
            +
              n_hidden: 256
         
     | 
| 20 | 
         
            +
              use_pitch_aug: true
         
     | 
| 21 | 
         
            +
              timesteps : 1000
         
     | 
| 22 | 
         
            +
              k_step_max: 0 # must be <= timesteps; if it is 0, train all
         
     | 
| 23 | 
         
            +
              n_spk: 1 # max number of different speakers
         
     | 
| 24 | 
         
            +
            device: cuda
         
     | 
| 25 | 
         
            +
            vocoder:
         
     | 
| 26 | 
         
            +
              type: 'nsf-hifigan'
         
     | 
| 27 | 
         
            +
              ckpt: 'pretrain/nsf_hifigan/model'
         
     | 
| 28 | 
         
            +
            infer:
         
     | 
| 29 | 
         
            +
              speedup: 10
         
     | 
| 30 | 
         
            +
              method: 'dpm-solver++' # 'pndm' or 'dpm-solver' or 'ddim' or 'unipc' or 'dpm-solver++'
         
     | 
| 31 | 
         
            +
            env:
         
     | 
| 32 | 
         
            +
              expdir: logs/44k/diffusion
         
     | 
| 33 | 
         
            +
              gpu_id: 0
         
     | 
| 34 | 
         
            +
            train:
         
     | 
| 35 | 
         
            +
              num_workers: 4 # If your CPU and GPU are both very strong, setting this to 0 may be faster!
         
     | 
| 36 | 
         
            +
              amp_dtype: fp32 # fp32, fp16 or bf16 (fp16 or bf16 may be faster if it is supported by your gpu)
         
     | 
| 37 | 
         
            +
              batch_size: 48
         
     | 
| 38 | 
         
            +
              cache_all_data: true # Setting this to false saves system memory or GPU memory, but may be slow
         
     | 
| 39 | 
         
            +
              cache_device: 'cpu' # Set to 'cuda' to cache the data in GPU memory; fastest on a strong GPU
         
     | 
| 40 | 
         
            +
              cache_fp16: true
         
     | 
| 41 | 
         
            +
              epochs: 100000
         
     | 
| 42 | 
         
            +
              interval_log: 10
         
     | 
| 43 | 
         
            +
              interval_val: 2000
         
     | 
| 44 | 
         
            +
              interval_force_save: 5000
         
     | 
| 45 | 
         
            +
              lr: 0.0001
         
     | 
| 46 | 
         
            +
              decay_step: 100000
         
     | 
| 47 | 
         
            +
              gamma: 0.5
         
     | 
| 48 | 
         
            +
              weight_decay: 0
         
     | 
| 49 | 
         
            +
              save_opt: false
         
     | 
| 50 | 
         
            +
            spk:
         
     | 
| 51 | 
         
            +
              'nyaru': 0
         
     | 
    	
        dataset_raw/wav_structure.txt
    ADDED
    
    | 
         @@ -0,0 +1,20 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            数据集准备
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            raw
         
     | 
| 4 | 
         
            +
            ├───speaker0
         
     | 
| 5 | 
         
            +
            │   ├───xxx1-xxx1.wav
         
     | 
| 6 | 
         
            +
            │   ├───...
         
     | 
| 7 | 
         
            +
            │   └───Lxx-0xx8.wav
         
     | 
| 8 | 
         
            +
            └───speaker1
         
     | 
| 9 | 
         
            +
                ├───xx2-0xxx2.wav
         
     | 
| 10 | 
         
            +
                ├───...
         
     | 
| 11 | 
         
            +
                └───xxx7-xxx007.wav
         
     | 
| 12 | 
         
            +
             
     | 
| 13 | 
         
            +
            此外还需要编辑config.json
         
     | 
| 14 | 
         
            +
             
     | 
| 15 | 
         
            +
            "n_speakers": 10
         
     | 
| 16 | 
         
            +
             
     | 
| 17 | 
         
            +
            "spk":{
         
     | 
| 18 | 
         
            +
                "speaker0": 0,
         
     | 
| 19 | 
         
            +
                "speaker1": 1
         
     | 
| 20 | 
         
            +
            }
         
     |