VampPluginSDK
2.1
|
00001 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ 00002 00003 /* 00004 Vamp 00005 00006 An API for audio analysis and feature extraction plugins. 00007 00008 Centre for Digital Music, Queen Mary, University of London. 00009 Copyright 2006-2009 Chris Cannam and QMUL. 00010 00011 Permission is hereby granted, free of charge, to any person 00012 obtaining a copy of this software and associated documentation 00013 files (the "Software"), to deal in the Software without 00014 restriction, including without limitation the rights to use, copy, 00015 modify, merge, publish, distribute, sublicense, and/or sell copies 00016 of the Software, and to permit persons to whom the Software is 00017 furnished to do so, subject to the following conditions: 00018 00019 The above copyright notice and this permission notice shall be 00020 included in all copies or substantial portions of the Software. 00021 00022 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 00023 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00024 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 00025 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 00026 ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 00027 CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 00028 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 00029 00030 Except as contained in this notice, the names of the Centre for 00031 Digital Music; Queen Mary, University of London; and Chris Cannam 00032 shall not be used in advertising or otherwise to promote the sale, 00033 use or other dealings in this Software without prior written 00034 authorization. 00035 */ 00036 00037 #include "FixedTempoEstimator.h" 00038 00039 using std::string; 00040 using std::vector; 00041 using std::cerr; 00042 using std::endl; 00043 00044 using Vamp::RealTime; 00045 00046 #include <cmath> 00047 #include <cstdio> 00048 00049 00050 class FixedTempoEstimator::D 00051 // this class just avoids us having to declare any data members in the header 00052 { 00053 public: 00054 D(float inputSampleRate); 00055 ~D(); 00056 00057 size_t getPreferredStepSize() const { return 64; } 00058 size_t getPreferredBlockSize() const { return 256; } 00059 00060 ParameterList getParameterDescriptors() const; 00061 float getParameter(string id) const; 00062 void setParameter(string id, float value); 00063 00064 OutputList getOutputDescriptors() const; 00065 00066 bool initialise(size_t channels, size_t stepSize, size_t blockSize); 00067 void reset(); 00068 FeatureSet process(const float *const *, RealTime); 00069 FeatureSet getRemainingFeatures(); 00070 00071 private: 00072 void calculate(); 00073 FeatureSet assembleFeatures(); 00074 00075 float lag2tempo(int); 00076 int tempo2lag(float); 00077 00078 float m_inputSampleRate; 00079 size_t m_stepSize; 00080 size_t m_blockSize; 00081 00082 float m_minbpm; 00083 float m_maxbpm; 00084 float m_maxdflen; 00085 00086 float *m_priorMagnitudes; 00087 00088 size_t m_dfsize; 00089 float *m_df; 00090 float *m_r; 00091 float *m_fr; 00092 float *m_t; 00093 size_t m_n; 00094 00095 Vamp::RealTime m_start; 00096 Vamp::RealTime m_lasttime; 00097 }; 00098 00099 FixedTempoEstimator::D::D(float inputSampleRate) : 00100 m_inputSampleRate(inputSampleRate), 00101 m_stepSize(0), 00102 m_blockSize(0), 00103 m_minbpm(50), 00104 m_maxbpm(190), 00105 m_maxdflen(10), 00106 m_priorMagnitudes(0), 00107 m_df(0), 00108 m_r(0), 00109 m_fr(0), 00110 m_t(0), 00111 m_n(0) 00112 { 00113 } 00114 00115 FixedTempoEstimator::D::~D() 00116 { 00117 delete[] m_priorMagnitudes; 00118 delete[] m_df; 00119 delete[] m_r; 00120 delete[] m_fr; 00121 delete[] m_t; 00122 } 00123 00124 FixedTempoEstimator::ParameterList 00125 FixedTempoEstimator::D::getParameterDescriptors() const 00126 { 00127 ParameterList list; 00128 00129 ParameterDescriptor d; 00130 d.identifier = "minbpm"; 00131 d.name = "Minimum estimated tempo"; 00132 d.description = "Minimum beat-per-minute value which the tempo estimator is able to return"; 00133 d.unit = "bpm"; 00134 d.minValue = 10; 00135 d.maxValue = 360; 00136 d.defaultValue = 50; 00137 d.isQuantized = false; 00138 list.push_back(d); 00139 00140 d.identifier = "maxbpm"; 00141 d.name = "Maximum estimated tempo"; 00142 d.description = "Maximum beat-per-minute value which the tempo estimator is able to return"; 00143 d.defaultValue = 190; 00144 list.push_back(d); 00145 00146 d.identifier = "maxdflen"; 00147 d.name = "Input duration to study"; 00148 d.description = "Length of audio input, in seconds, which should be taken into account when estimating tempo. There is no need to supply the plugin with any further input once this time has elapsed since the start of the audio. The tempo estimator may use only the first part of this, up to eight times the slowest beat duration: increasing this value further than that is unlikely to improve results."; 00149 d.unit = "s"; 00150 d.minValue = 2; 00151 d.maxValue = 40; 00152 d.defaultValue = 10; 00153 list.push_back(d); 00154 00155 return list; 00156 } 00157 00158 float 00159 FixedTempoEstimator::D::getParameter(string id) const 00160 { 00161 if (id == "minbpm") { 00162 return m_minbpm; 00163 } else if (id == "maxbpm") { 00164 return m_maxbpm; 00165 } else if (id == "maxdflen") { 00166 return m_maxdflen; 00167 } 00168 return 0.f; 00169 } 00170 00171 void 00172 FixedTempoEstimator::D::setParameter(string id, float value) 00173 { 00174 if (id == "minbpm") { 00175 m_minbpm = value; 00176 } else if (id == "maxbpm") { 00177 m_maxbpm = value; 00178 } else if (id == "maxdflen") { 00179 m_maxdflen = value; 00180 } 00181 } 00182 00183 static int TempoOutput = 0; 00184 static int CandidatesOutput = 1; 00185 static int DFOutput = 2; 00186 static int ACFOutput = 3; 00187 static int FilteredACFOutput = 4; 00188 00189 FixedTempoEstimator::OutputList 00190 FixedTempoEstimator::D::getOutputDescriptors() const 00191 { 00192 OutputList list; 00193 00194 OutputDescriptor d; 00195 d.identifier = "tempo"; 00196 d.name = "Tempo"; 00197 d.description = "Estimated tempo"; 00198 d.unit = "bpm"; 00199 d.hasFixedBinCount = true; 00200 d.binCount = 1; 00201 d.hasKnownExtents = false; 00202 d.isQuantized = false; 00203 d.sampleType = OutputDescriptor::VariableSampleRate; 00204 d.sampleRate = m_inputSampleRate; 00205 d.hasDuration = true; // our returned tempo spans a certain range 00206 list.push_back(d); 00207 00208 d.identifier = "candidates"; 00209 d.name = "Tempo candidates"; 00210 d.description = "Possible tempo estimates, one per bin with the most likely in the first bin"; 00211 d.unit = "bpm"; 00212 d.hasFixedBinCount = false; 00213 list.push_back(d); 00214 00215 d.identifier = "detectionfunction"; 00216 d.name = "Detection Function"; 00217 d.description = "Onset detection function"; 00218 d.unit = ""; 00219 d.hasFixedBinCount = 1; 00220 d.binCount = 1; 00221 d.hasKnownExtents = true; 00222 d.minValue = 0.0; 00223 d.maxValue = 1.0; 00224 d.isQuantized = false; 00225 d.quantizeStep = 0.0; 00226 d.sampleType = OutputDescriptor::FixedSampleRate; 00227 if (m_stepSize) { 00228 d.sampleRate = m_inputSampleRate / m_stepSize; 00229 } else { 00230 d.sampleRate = m_inputSampleRate / (getPreferredBlockSize()/2); 00231 } 00232 d.hasDuration = false; 00233 list.push_back(d); 00234 00235 d.identifier = "acf"; 00236 d.name = "Autocorrelation Function"; 00237 d.description = "Autocorrelation of onset detection function"; 00238 d.hasKnownExtents = false; 00239 d.unit = "r"; 00240 list.push_back(d); 00241 00242 d.identifier = "filtered_acf"; 00243 d.name = "Filtered Autocorrelation"; 00244 d.description = "Filtered autocorrelation of onset detection function"; 00245 d.unit = "r"; 00246 list.push_back(d); 00247 00248 return list; 00249 } 00250 00251 bool 00252 FixedTempoEstimator::D::initialise(size_t, size_t stepSize, size_t blockSize) 00253 { 00254 m_stepSize = stepSize; 00255 m_blockSize = blockSize; 00256 00257 float dfLengthSecs = m_maxdflen; 00258 m_dfsize = (dfLengthSecs * m_inputSampleRate) / m_stepSize; 00259 00260 m_priorMagnitudes = new float[m_blockSize/2]; 00261 m_df = new float[m_dfsize]; 00262 00263 for (size_t i = 0; i < m_blockSize/2; ++i) { 00264 m_priorMagnitudes[i] = 0.f; 00265 } 00266 for (size_t i = 0; i < m_dfsize; ++i) { 00267 m_df[i] = 0.f; 00268 } 00269 00270 m_n = 0; 00271 00272 return true; 00273 } 00274 00275 void 00276 FixedTempoEstimator::D::reset() 00277 { 00278 if (!m_priorMagnitudes) return; 00279 00280 for (size_t i = 0; i < m_blockSize/2; ++i) { 00281 m_priorMagnitudes[i] = 0.f; 00282 } 00283 for (size_t i = 0; i < m_dfsize; ++i) { 00284 m_df[i] = 0.f; 00285 } 00286 00287 delete[] m_r; 00288 m_r = 0; 00289 00290 delete[] m_fr; 00291 m_fr = 0; 00292 00293 delete[] m_t; 00294 m_t = 0; 00295 00296 m_n = 0; 00297 00298 m_start = RealTime::zeroTime; 00299 m_lasttime = RealTime::zeroTime; 00300 } 00301 00302 FixedTempoEstimator::FeatureSet 00303 FixedTempoEstimator::D::process(const float *const *inputBuffers, RealTime ts) 00304 { 00305 FeatureSet fs; 00306 00307 if (m_stepSize == 0) { 00308 cerr << "ERROR: FixedTempoEstimator::process: " 00309 << "FixedTempoEstimator has not been initialised" 00310 << endl; 00311 return fs; 00312 } 00313 00314 if (m_n == 0) m_start = ts; 00315 m_lasttime = ts; 00316 00317 if (m_n == m_dfsize) { 00318 // If we have seen enough input, do the estimation and return 00319 calculate(); 00320 fs = assembleFeatures(); 00321 ++m_n; 00322 return fs; 00323 } 00324 00325 // If we have seen more than enough, just discard and return! 00326 if (m_n > m_dfsize) return FeatureSet(); 00327 00328 float value = 0.f; 00329 00330 // m_df will contain an onset detection function based on the rise 00331 // in overall power from one spectral frame to the next -- 00332 // simplistic but reasonably effective for our purposes. 00333 00334 for (size_t i = 1; i < m_blockSize/2; ++i) { 00335 00336 float real = inputBuffers[0][i*2]; 00337 float imag = inputBuffers[0][i*2 + 1]; 00338 00339 float sqrmag = real * real + imag * imag; 00340 value += fabsf(sqrmag - m_priorMagnitudes[i]); 00341 00342 m_priorMagnitudes[i] = sqrmag; 00343 } 00344 00345 m_df[m_n] = value; 00346 00347 ++m_n; 00348 return fs; 00349 } 00350 00351 FixedTempoEstimator::FeatureSet 00352 FixedTempoEstimator::D::getRemainingFeatures() 00353 { 00354 FeatureSet fs; 00355 if (m_n > m_dfsize) return fs; 00356 calculate(); 00357 fs = assembleFeatures(); 00358 ++m_n; 00359 return fs; 00360 } 00361 00362 float 00363 FixedTempoEstimator::D::lag2tempo(int lag) 00364 { 00365 return 60.f / ((lag * m_stepSize) / m_inputSampleRate); 00366 } 00367 00368 int 00369 FixedTempoEstimator::D::tempo2lag(float tempo) 00370 { 00371 return ((60.f / tempo) * m_inputSampleRate) / m_stepSize; 00372 } 00373 00374 void 00375 FixedTempoEstimator::D::calculate() 00376 { 00377 if (m_r) { 00378 cerr << "FixedTempoEstimator::calculate: calculation already happened?" << endl; 00379 return; 00380 } 00381 00382 if (m_n < m_dfsize / 9 && 00383 m_n < (1.0 * m_inputSampleRate) / m_stepSize) { // 1 second 00384 cerr << "FixedTempoEstimator::calculate: Input is too short" << endl; 00385 return; 00386 } 00387 00388 // This function takes m_df (the detection function array filled 00389 // out in process()) and calculates m_r (the raw autocorrelation) 00390 // and m_fr (the filtered autocorrelation from whose peaks tempo 00391 // estimates will be taken). 00392 00393 int n = m_n; // length of actual df array (m_dfsize is the theoretical max) 00394 00395 m_r = new float[n/2]; // raw autocorrelation 00396 m_fr = new float[n/2]; // filtered autocorrelation 00397 m_t = new float[n/2]; // averaged tempo estimate for each lag value 00398 00399 for (int i = 0; i < n/2; ++i) { 00400 m_r[i] = 0.f; 00401 m_fr[i] = 0.f; 00402 m_t[i] = lag2tempo(i); 00403 } 00404 00405 // Calculate the raw autocorrelation of the detection function 00406 00407 for (int i = 0; i < n/2; ++i) { 00408 00409 for (int j = i; j < n; ++j) { 00410 m_r[i] += m_df[j] * m_df[j - i]; 00411 } 00412 00413 m_r[i] /= n - i - 1; 00414 } 00415 00416 // Filter the autocorrelation and average out the tempo estimates 00417 00418 float related[] = { 0.5, 2, 4, 8 }; 00419 00420 for (int i = 1; i < n/2-1; ++i) { 00421 00422 m_fr[i] = m_r[i]; 00423 00424 int div = 1; 00425 00426 for (int j = 0; j < int(sizeof(related)/sizeof(related[0])); ++j) { 00427 00428 // Check for an obvious peak at each metrically related lag 00429 00430 int k0 = int(i * related[j] + 0.5); 00431 00432 if (k0 >= 0 && k0 < int(n/2)) { 00433 00434 int kmax = 0, kmin = 0; 00435 float kvmax = 0, kvmin = 0; 00436 bool have = false; 00437 00438 for (int k = k0 - 1; k <= k0 + 1; ++k) { 00439 00440 if (k < 0 || k >= n/2) continue; 00441 00442 if (!have || (m_r[k] > kvmax)) { kmax = k; kvmax = m_r[k]; } 00443 if (!have || (m_r[k] < kvmin)) { kmin = k; kvmin = m_r[k]; } 00444 00445 have = true; 00446 } 00447 00448 // Boost the original lag according to the strongest 00449 // value found close to this related lag 00450 00451 m_fr[i] += m_r[kmax] / 5; 00452 00453 if ((kmax == 0 || m_r[kmax] > m_r[kmax-1]) && 00454 (kmax == n/2-1 || m_r[kmax] > m_r[kmax+1]) && 00455 kvmax > kvmin * 1.05) { 00456 00457 // The strongest value close to the related lag is 00458 // also a pretty good looking peak, so use it to 00459 // improve our tempo estimate for the original lag 00460 00461 m_t[i] = m_t[i] + lag2tempo(kmax) * related[j]; 00462 ++div; 00463 } 00464 } 00465 } 00466 00467 m_t[i] /= div; 00468 00469 // Finally apply a primitive perceptual weighting (to prefer 00470 // tempi of around 120-130) 00471 00472 float weight = 1.f - fabsf(128.f - lag2tempo(i)) * 0.005; 00473 if (weight < 0.f) weight = 0.f; 00474 weight = weight * weight * weight; 00475 00476 m_fr[i] += m_fr[i] * (weight / 3); 00477 } 00478 } 00479 00480 FixedTempoEstimator::FeatureSet 00481 FixedTempoEstimator::D::assembleFeatures() 00482 { 00483 FeatureSet fs; 00484 if (!m_r) return fs; // No autocorrelation: no results 00485 00486 Feature feature; 00487 feature.hasTimestamp = true; 00488 feature.hasDuration = false; 00489 feature.label = ""; 00490 feature.values.clear(); 00491 feature.values.push_back(0.f); 00492 00493 char buffer[40]; 00494 00495 int n = m_n; 00496 00497 for (int i = 0; i < n; ++i) { 00498 00499 // Return the detection function in the DF output 00500 00501 feature.timestamp = m_start + 00502 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate); 00503 feature.values[0] = m_df[i]; 00504 feature.label = ""; 00505 fs[DFOutput].push_back(feature); 00506 } 00507 00508 for (int i = 1; i < n/2; ++i) { 00509 00510 // Return the raw autocorrelation in the ACF output, each 00511 // value labelled according to its corresponding tempo 00512 00513 feature.timestamp = m_start + 00514 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate); 00515 feature.values[0] = m_r[i]; 00516 sprintf(buffer, "%.1f bpm", lag2tempo(i)); 00517 if (i == n/2-1) feature.label = ""; 00518 else feature.label = buffer; 00519 fs[ACFOutput].push_back(feature); 00520 } 00521 00522 float t0 = m_minbpm; // our minimum detected tempo 00523 float t1 = m_maxbpm; // our maximum detected tempo 00524 00525 int p0 = tempo2lag(t1); 00526 int p1 = tempo2lag(t0); 00527 00528 std::map<float, int> candidates; 00529 00530 for (int i = p0; i <= p1 && i+1 < n/2; ++i) { 00531 00532 if (m_fr[i] > m_fr[i-1] && 00533 m_fr[i] > m_fr[i+1]) { 00534 00535 // This is a peak in the filtered autocorrelation: stick 00536 // it into the map from filtered autocorrelation to lag 00537 // index -- this sorts our peaks by filtered acf value 00538 00539 candidates[m_fr[i]] = i; 00540 } 00541 00542 // Also return the filtered autocorrelation in its own output 00543 00544 feature.timestamp = m_start + 00545 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate); 00546 feature.values[0] = m_fr[i]; 00547 sprintf(buffer, "%.1f bpm", lag2tempo(i)); 00548 if (i == p1 || i == n/2-2) feature.label = ""; 00549 else feature.label = buffer; 00550 fs[FilteredACFOutput].push_back(feature); 00551 } 00552 00553 if (candidates.empty()) { 00554 cerr << "No tempo candidates!" << endl; 00555 return fs; 00556 } 00557 00558 feature.hasTimestamp = true; 00559 feature.timestamp = m_start; 00560 00561 feature.hasDuration = true; 00562 feature.duration = m_lasttime - m_start; 00563 00564 // The map contains only peaks and is sorted by filtered acf 00565 // value, so the final element in it is our "best" tempo guess 00566 00567 std::map<float, int>::const_iterator ci = candidates.end(); 00568 --ci; 00569 int maxpi = ci->second; 00570 00571 if (m_t[maxpi] > 0) { 00572 00573 // This lag has an adjusted tempo from the averaging process: 00574 // use it 00575 00576 feature.values[0] = m_t[maxpi]; 00577 00578 } else { 00579 00580 // shouldn't happen -- it would imply that this high value was 00581 // not a peak! 00582 00583 feature.values[0] = lag2tempo(maxpi); 00584 cerr << "WARNING: No stored tempo for index " << maxpi << endl; 00585 } 00586 00587 sprintf(buffer, "%.1f bpm", feature.values[0]); 00588 feature.label = buffer; 00589 00590 // Return the best tempo in the main output 00591 00592 fs[TempoOutput].push_back(feature); 00593 00594 // And return the other estimates (up to the arbitrarily chosen 00595 // number of 10 of them) in the candidates output 00596 00597 feature.values.clear(); 00598 feature.label = ""; 00599 00600 while (feature.values.size() < 10) { 00601 if (m_t[ci->second] > 0) { 00602 feature.values.push_back(m_t[ci->second]); 00603 } else { 00604 feature.values.push_back(lag2tempo(ci->second)); 00605 } 00606 if (ci == candidates.begin()) break; 00607 --ci; 00608 } 00609 00610 fs[CandidatesOutput].push_back(feature); 00611 00612 return fs; 00613 } 00614 00615 00616 00617 FixedTempoEstimator::FixedTempoEstimator(float inputSampleRate) : 00618 Plugin(inputSampleRate), 00619 m_d(new D(inputSampleRate)) 00620 { 00621 } 00622 00623 FixedTempoEstimator::~FixedTempoEstimator() 00624 { 00625 delete m_d; 00626 } 00627 00628 string 00629 FixedTempoEstimator::getIdentifier() const 00630 { 00631 return "fixedtempo"; 00632 } 00633 00634 string 00635 FixedTempoEstimator::getName() const 00636 { 00637 return "Simple Fixed Tempo Estimator"; 00638 } 00639 00640 string 00641 FixedTempoEstimator::getDescription() const 00642 { 00643 return "Study a short section of audio and estimate its tempo, assuming the tempo is constant"; 00644 } 00645 00646 string 00647 FixedTempoEstimator::getMaker() const 00648 { 00649 return "Vamp SDK Example Plugins"; 00650 } 00651 00652 int 00653 FixedTempoEstimator::getPluginVersion() const 00654 { 00655 return 1; 00656 } 00657 00658 string 00659 FixedTempoEstimator::getCopyright() const 00660 { 00661 return "Code copyright 2008 Queen Mary, University of London. Freely redistributable (BSD license)"; 00662 } 00663 00664 size_t 00665 FixedTempoEstimator::getPreferredStepSize() const 00666 { 00667 return m_d->getPreferredStepSize(); 00668 } 00669 00670 size_t 00671 FixedTempoEstimator::getPreferredBlockSize() const 00672 { 00673 return m_d->getPreferredBlockSize(); 00674 } 00675 00676 bool 00677 FixedTempoEstimator::initialise(size_t channels, size_t stepSize, size_t blockSize) 00678 { 00679 if (channels < getMinChannelCount() || 00680 channels > getMaxChannelCount()) return false; 00681 00682 return m_d->initialise(channels, stepSize, blockSize); 00683 } 00684 00685 void 00686 FixedTempoEstimator::reset() 00687 { 00688 return m_d->reset(); 00689 } 00690 00691 FixedTempoEstimator::ParameterList 00692 FixedTempoEstimator::getParameterDescriptors() const 00693 { 00694 return m_d->getParameterDescriptors(); 00695 } 00696 00697 float 00698 FixedTempoEstimator::getParameter(std::string id) const 00699 { 00700 return m_d->getParameter(id); 00701 } 00702 00703 void 00704 FixedTempoEstimator::setParameter(std::string id, float value) 00705 { 00706 m_d->setParameter(id, value); 00707 } 00708 00709 FixedTempoEstimator::OutputList 00710 FixedTempoEstimator::getOutputDescriptors() const 00711 { 00712 return m_d->getOutputDescriptors(); 00713 } 00714 00715 FixedTempoEstimator::FeatureSet 00716 FixedTempoEstimator::process(const float *const *inputBuffers, RealTime ts) 00717 { 00718 return m_d->process(inputBuffers, ts); 00719 } 00720 00721 FixedTempoEstimator::FeatureSet 00722 FixedTempoEstimator::getRemainingFeatures() 00723 { 00724 return m_d->getRemainingFeatures(); 00725 }