VampPluginSDK  2.1
FixedTempoEstimator.cpp
Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
00002 
00003 /*
00004     Vamp
00005 
00006     An API for audio analysis and feature extraction plugins.
00007 
00008     Centre for Digital Music, Queen Mary, University of London.
00009     Copyright 2006-2009 Chris Cannam and QMUL.
00010   
00011     Permission is hereby granted, free of charge, to any person
00012     obtaining a copy of this software and associated documentation
00013     files (the "Software"), to deal in the Software without
00014     restriction, including without limitation the rights to use, copy,
00015     modify, merge, publish, distribute, sublicense, and/or sell copies
00016     of the Software, and to permit persons to whom the Software is
00017     furnished to do so, subject to the following conditions:
00018 
00019     The above copyright notice and this permission notice shall be
00020     included in all copies or substantial portions of the Software.
00021 
00022     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
00023     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00024     MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
00025     NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
00026     ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
00027     CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
00028     WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00029 
00030     Except as contained in this notice, the names of the Centre for
00031     Digital Music; Queen Mary, University of London; and Chris Cannam
00032     shall not be used in advertising or otherwise to promote the sale,
00033     use or other dealings in this Software without prior written
00034     authorization.
00035 */
00036 
00037 #include "FixedTempoEstimator.h"
00038 
00039 using std::string;
00040 using std::vector;
00041 using std::cerr;
00042 using std::endl;
00043 
00044 using Vamp::RealTime;
00045 
00046 #include <cmath>
00047 #include <cstdio>
00048 
00049 
00050 class FixedTempoEstimator::D
00051 // this class just avoids us having to declare any data members in the header
00052 {
00053 public:
00054     D(float inputSampleRate);
00055     ~D();
00056 
00057     size_t getPreferredStepSize() const { return 64; }
00058     size_t getPreferredBlockSize() const { return 256; }
00059 
00060     ParameterList getParameterDescriptors() const;
00061     float getParameter(string id) const;
00062     void setParameter(string id, float value);
00063 
00064     OutputList getOutputDescriptors() const;
00065 
00066     bool initialise(size_t channels, size_t stepSize, size_t blockSize);
00067     void reset();
00068     FeatureSet process(const float *const *, RealTime);
00069     FeatureSet getRemainingFeatures();
00070 
00071 private:
00072     void calculate();
00073     FeatureSet assembleFeatures();
00074 
00075     float lag2tempo(int);
00076     int tempo2lag(float);
00077 
00078     float m_inputSampleRate;
00079     size_t m_stepSize;
00080     size_t m_blockSize;
00081 
00082     float m_minbpm;
00083     float m_maxbpm;
00084     float m_maxdflen;
00085 
00086     float *m_priorMagnitudes;
00087 
00088     size_t m_dfsize;
00089     float *m_df;
00090     float *m_r;
00091     float *m_fr;
00092     float *m_t;
00093     size_t m_n;
00094 
00095     Vamp::RealTime m_start;
00096     Vamp::RealTime m_lasttime;
00097 };
00098 
00099 FixedTempoEstimator::D::D(float inputSampleRate) :
00100     m_inputSampleRate(inputSampleRate),
00101     m_stepSize(0),
00102     m_blockSize(0),
00103     m_minbpm(50),
00104     m_maxbpm(190),
00105     m_maxdflen(10),
00106     m_priorMagnitudes(0),
00107     m_df(0),
00108     m_r(0),
00109     m_fr(0),
00110     m_t(0),
00111     m_n(0)
00112 {
00113 }
00114 
00115 FixedTempoEstimator::D::~D()
00116 {
00117     delete[] m_priorMagnitudes;
00118     delete[] m_df;
00119     delete[] m_r;
00120     delete[] m_fr;
00121     delete[] m_t;
00122 }
00123 
00124 FixedTempoEstimator::ParameterList
00125 FixedTempoEstimator::D::getParameterDescriptors() const
00126 {
00127     ParameterList list;
00128 
00129     ParameterDescriptor d;
00130     d.identifier = "minbpm";
00131     d.name = "Minimum estimated tempo";
00132     d.description = "Minimum beat-per-minute value which the tempo estimator is able to return";
00133     d.unit = "bpm";
00134     d.minValue = 10;
00135     d.maxValue = 360;
00136     d.defaultValue = 50;
00137     d.isQuantized = false;
00138     list.push_back(d);
00139 
00140     d.identifier = "maxbpm";
00141     d.name = "Maximum estimated tempo";
00142     d.description = "Maximum beat-per-minute value which the tempo estimator is able to return";
00143     d.defaultValue = 190;
00144     list.push_back(d);
00145 
00146     d.identifier = "maxdflen";
00147     d.name = "Input duration to study";
00148     d.description = "Length of audio input, in seconds, which should be taken into account when estimating tempo.  There is no need to supply the plugin with any further input once this time has elapsed since the start of the audio.  The tempo estimator may use only the first part of this, up to eight times the slowest beat duration: increasing this value further than that is unlikely to improve results.";
00149     d.unit = "s";
00150     d.minValue = 2;
00151     d.maxValue = 40;
00152     d.defaultValue = 10;
00153     list.push_back(d);
00154 
00155     return list;
00156 }
00157 
00158 float
00159 FixedTempoEstimator::D::getParameter(string id) const
00160 {
00161     if (id == "minbpm") {
00162         return m_minbpm;
00163     } else if (id == "maxbpm") {
00164         return m_maxbpm;
00165     } else if (id == "maxdflen") {
00166         return m_maxdflen;
00167     }
00168     return 0.f;
00169 }
00170 
00171 void
00172 FixedTempoEstimator::D::setParameter(string id, float value)
00173 {
00174     if (id == "minbpm") {
00175         m_minbpm = value;
00176     } else if (id == "maxbpm") {
00177         m_maxbpm = value;
00178     } else if (id == "maxdflen") {
00179         m_maxdflen = value;
00180     }
00181 }
00182 
// Indices of the plugin's outputs, used as keys into the FeatureSet
// built by assembleFeatures().  They are constants, not state, so
// they are declared const to prevent accidental modification.
static const int TempoOutput = 0;
static const int CandidatesOutput = 1;
static const int DFOutput = 2;
static const int ACFOutput = 3;
static const int FilteredACFOutput = 4;
00188 
00189 FixedTempoEstimator::OutputList
00190 FixedTempoEstimator::D::getOutputDescriptors() const
00191 {
00192     OutputList list;
00193 
00194     OutputDescriptor d;
00195     d.identifier = "tempo";
00196     d.name = "Tempo";
00197     d.description = "Estimated tempo";
00198     d.unit = "bpm";
00199     d.hasFixedBinCount = true;
00200     d.binCount = 1;
00201     d.hasKnownExtents = false;
00202     d.isQuantized = false;
00203     d.sampleType = OutputDescriptor::VariableSampleRate;
00204     d.sampleRate = m_inputSampleRate;
00205     d.hasDuration = true; // our returned tempo spans a certain range
00206     list.push_back(d);
00207 
00208     d.identifier = "candidates";
00209     d.name = "Tempo candidates";
00210     d.description = "Possible tempo estimates, one per bin with the most likely in the first bin";
00211     d.unit = "bpm";
00212     d.hasFixedBinCount = false;
00213     list.push_back(d);
00214 
00215     d.identifier = "detectionfunction";
00216     d.name = "Detection Function";
00217     d.description = "Onset detection function";
00218     d.unit = "";
00219     d.hasFixedBinCount = 1;
00220     d.binCount = 1;
00221     d.hasKnownExtents = true;
00222     d.minValue = 0.0;
00223     d.maxValue = 1.0;
00224     d.isQuantized = false;
00225     d.quantizeStep = 0.0;
00226     d.sampleType = OutputDescriptor::FixedSampleRate;
00227     if (m_stepSize) {
00228         d.sampleRate = m_inputSampleRate / m_stepSize;
00229     } else {
00230         d.sampleRate = m_inputSampleRate / (getPreferredBlockSize()/2);
00231     }
00232     d.hasDuration = false;
00233     list.push_back(d);
00234 
00235     d.identifier = "acf";
00236     d.name = "Autocorrelation Function";
00237     d.description = "Autocorrelation of onset detection function";
00238     d.hasKnownExtents = false;
00239     d.unit = "r";
00240     list.push_back(d);
00241 
00242     d.identifier = "filtered_acf";
00243     d.name = "Filtered Autocorrelation";
00244     d.description = "Filtered autocorrelation of onset detection function";
00245     d.unit = "r";
00246     list.push_back(d);
00247 
00248     return list;
00249 }
00250 
00251 bool
00252 FixedTempoEstimator::D::initialise(size_t, size_t stepSize, size_t blockSize)
00253 {
00254     m_stepSize = stepSize;
00255     m_blockSize = blockSize;
00256 
00257     float dfLengthSecs = m_maxdflen;
00258     m_dfsize = (dfLengthSecs * m_inputSampleRate) / m_stepSize;
00259 
00260     m_priorMagnitudes = new float[m_blockSize/2];
00261     m_df = new float[m_dfsize];
00262 
00263     for (size_t i = 0; i < m_blockSize/2; ++i) {
00264         m_priorMagnitudes[i] = 0.f;
00265     }
00266     for (size_t i = 0; i < m_dfsize; ++i) {
00267         m_df[i] = 0.f;
00268     }
00269 
00270     m_n = 0;
00271 
00272     return true;
00273 }
00274 
00275 void
00276 FixedTempoEstimator::D::reset()
00277 {
00278     if (!m_priorMagnitudes) return;
00279 
00280     for (size_t i = 0; i < m_blockSize/2; ++i) {
00281         m_priorMagnitudes[i] = 0.f;
00282     }
00283     for (size_t i = 0; i < m_dfsize; ++i) {
00284         m_df[i] = 0.f;
00285     }
00286 
00287     delete[] m_r;
00288     m_r = 0;
00289 
00290     delete[] m_fr; 
00291     m_fr = 0;
00292 
00293     delete[] m_t; 
00294     m_t = 0;
00295 
00296     m_n = 0;
00297 
00298     m_start = RealTime::zeroTime;
00299     m_lasttime = RealTime::zeroTime;
00300 }
00301 
00302 FixedTempoEstimator::FeatureSet
00303 FixedTempoEstimator::D::process(const float *const *inputBuffers, RealTime ts)
00304 {
00305     FeatureSet fs;
00306 
00307     if (m_stepSize == 0) {
00308         cerr << "ERROR: FixedTempoEstimator::process: "
00309              << "FixedTempoEstimator has not been initialised"
00310              << endl;
00311         return fs;
00312     }
00313 
00314     if (m_n == 0) m_start = ts;
00315     m_lasttime = ts;
00316 
00317     if (m_n == m_dfsize) {
00318         // If we have seen enough input, do the estimation and return
00319         calculate();
00320         fs = assembleFeatures();
00321         ++m_n;
00322         return fs;
00323     }
00324 
00325     // If we have seen more than enough, just discard and return!
00326     if (m_n > m_dfsize) return FeatureSet();
00327 
00328     float value = 0.f;
00329 
00330     // m_df will contain an onset detection function based on the rise
00331     // in overall power from one spectral frame to the next --
00332     // simplistic but reasonably effective for our purposes.
00333 
00334     for (size_t i = 1; i < m_blockSize/2; ++i) {
00335 
00336         float real = inputBuffers[0][i*2];
00337         float imag = inputBuffers[0][i*2 + 1];
00338 
00339         float sqrmag = real * real + imag * imag;
00340         value += fabsf(sqrmag - m_priorMagnitudes[i]);
00341 
00342         m_priorMagnitudes[i] = sqrmag;
00343     }
00344 
00345     m_df[m_n] = value;
00346 
00347     ++m_n;
00348     return fs;
00349 }    
00350 
00351 FixedTempoEstimator::FeatureSet
00352 FixedTempoEstimator::D::getRemainingFeatures()
00353 {
00354     FeatureSet fs;
00355     if (m_n > m_dfsize) return fs;
00356     calculate();
00357     fs = assembleFeatures();
00358     ++m_n;
00359     return fs;
00360 }
00361 
00362 float
00363 FixedTempoEstimator::D::lag2tempo(int lag)
00364 {
00365     return 60.f / ((lag * m_stepSize) / m_inputSampleRate);
00366 }
00367 
00368 int
00369 FixedTempoEstimator::D::tempo2lag(float tempo)
00370 {
00371     return ((60.f / tempo) * m_inputSampleRate) / m_stepSize;
00372 }
00373 
00374 void
00375 FixedTempoEstimator::D::calculate()
00376 {    
00377     if (m_r) {
00378         cerr << "FixedTempoEstimator::calculate: calculation already happened?" << endl;
00379         return;
00380     }
00381 
00382     if (m_n < m_dfsize / 9 &&
00383         m_n < (1.0 * m_inputSampleRate) / m_stepSize) { // 1 second
00384         cerr << "FixedTempoEstimator::calculate: Input is too short" << endl;
00385         return;
00386     }
00387 
00388     // This function takes m_df (the detection function array filled
00389     // out in process()) and calculates m_r (the raw autocorrelation)
00390     // and m_fr (the filtered autocorrelation from whose peaks tempo
00391     // estimates will be taken).
00392 
00393     int n = m_n; // length of actual df array (m_dfsize is the theoretical max)
00394 
00395     m_r  = new float[n/2]; // raw autocorrelation
00396     m_fr = new float[n/2]; // filtered autocorrelation
00397     m_t  = new float[n/2]; // averaged tempo estimate for each lag value
00398 
00399     for (int i = 0; i < n/2; ++i) {
00400         m_r[i]  = 0.f;
00401         m_fr[i] = 0.f;
00402         m_t[i]  = lag2tempo(i);
00403     }
00404 
00405     // Calculate the raw autocorrelation of the detection function
00406 
00407     for (int i = 0; i < n/2; ++i) {
00408 
00409         for (int j = i; j < n; ++j) {
00410             m_r[i] += m_df[j] * m_df[j - i];
00411         }
00412 
00413         m_r[i] /= n - i - 1;
00414     }
00415 
00416     // Filter the autocorrelation and average out the tempo estimates
00417     
00418     float related[] = { 0.5, 2, 4, 8 };
00419 
00420     for (int i = 1; i < n/2-1; ++i) {
00421 
00422         m_fr[i] = m_r[i];
00423 
00424         int div = 1;
00425 
00426         for (int j = 0; j < int(sizeof(related)/sizeof(related[0])); ++j) {
00427 
00428             // Check for an obvious peak at each metrically related lag
00429 
00430             int k0 = int(i * related[j] + 0.5);
00431 
00432             if (k0 >= 0 && k0 < int(n/2)) {
00433 
00434                 int kmax = 0, kmin = 0;
00435                 float kvmax = 0, kvmin = 0;
00436                 bool have = false;
00437 
00438                 for (int k = k0 - 1; k <= k0 + 1; ++k) {
00439 
00440                     if (k < 0 || k >= n/2) continue;
00441 
00442                     if (!have || (m_r[k] > kvmax)) { kmax = k; kvmax = m_r[k]; }
00443                     if (!have || (m_r[k] < kvmin)) { kmin = k; kvmin = m_r[k]; }
00444                     
00445                     have = true;
00446                 }
00447                 
00448                 // Boost the original lag according to the strongest
00449                 // value found close to this related lag
00450 
00451                 m_fr[i] += m_r[kmax] / 5;
00452 
00453                 if ((kmax == 0 || m_r[kmax] > m_r[kmax-1]) &&
00454                     (kmax == n/2-1 || m_r[kmax] > m_r[kmax+1]) &&
00455                     kvmax > kvmin * 1.05) {
00456 
00457                     // The strongest value close to the related lag is
00458                     // also a pretty good looking peak, so use it to
00459                     // improve our tempo estimate for the original lag
00460                     
00461                     m_t[i] = m_t[i] + lag2tempo(kmax) * related[j];
00462                     ++div;
00463                 }
00464             }
00465         }
00466         
00467         m_t[i] /= div;
00468         
00469         // Finally apply a primitive perceptual weighting (to prefer
00470         // tempi of around 120-130)
00471 
00472         float weight = 1.f - fabsf(128.f - lag2tempo(i)) * 0.005;
00473         if (weight < 0.f) weight = 0.f;
00474         weight = weight * weight * weight;
00475 
00476         m_fr[i] += m_fr[i] * (weight / 3);
00477     }
00478 }
00479     
00480 FixedTempoEstimator::FeatureSet
00481 FixedTempoEstimator::D::assembleFeatures()
00482 {
00483     FeatureSet fs;
00484     if (!m_r) return fs; // No autocorrelation: no results
00485 
00486     Feature feature;
00487     feature.hasTimestamp = true;
00488     feature.hasDuration = false;
00489     feature.label = "";
00490     feature.values.clear();
00491     feature.values.push_back(0.f);
00492 
00493     char buffer[40];
00494 
00495     int n = m_n;
00496 
00497     for (int i = 0; i < n; ++i) {
00498 
00499         // Return the detection function in the DF output
00500 
00501         feature.timestamp = m_start +
00502             RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
00503         feature.values[0] = m_df[i];
00504         feature.label = "";
00505         fs[DFOutput].push_back(feature);
00506     }
00507 
00508     for (int i = 1; i < n/2; ++i) {
00509 
00510         // Return the raw autocorrelation in the ACF output, each
00511         // value labelled according to its corresponding tempo
00512 
00513         feature.timestamp = m_start +
00514             RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
00515         feature.values[0] = m_r[i];
00516         sprintf(buffer, "%.1f bpm", lag2tempo(i));
00517         if (i == n/2-1) feature.label = "";
00518         else feature.label = buffer;
00519         fs[ACFOutput].push_back(feature);
00520     }
00521 
00522     float t0 = m_minbpm; // our minimum detected tempo
00523     float t1 = m_maxbpm; // our maximum detected tempo
00524 
00525     int p0 = tempo2lag(t1);
00526     int p1 = tempo2lag(t0);
00527 
00528     std::map<float, int> candidates;
00529 
00530     for (int i = p0; i <= p1 && i+1 < n/2; ++i) {
00531 
00532         if (m_fr[i] > m_fr[i-1] &&
00533             m_fr[i] > m_fr[i+1]) {
00534 
00535             // This is a peak in the filtered autocorrelation: stick
00536             // it into the map from filtered autocorrelation to lag
00537             // index -- this sorts our peaks by filtered acf value
00538 
00539             candidates[m_fr[i]] = i;
00540         }
00541 
00542         // Also return the filtered autocorrelation in its own output
00543 
00544         feature.timestamp = m_start +
00545             RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
00546         feature.values[0] = m_fr[i];
00547         sprintf(buffer, "%.1f bpm", lag2tempo(i));
00548         if (i == p1 || i == n/2-2) feature.label = "";
00549         else feature.label = buffer;
00550         fs[FilteredACFOutput].push_back(feature);
00551     }
00552 
00553     if (candidates.empty()) {
00554         cerr << "No tempo candidates!" << endl;
00555         return fs;
00556     }
00557 
00558     feature.hasTimestamp = true;
00559     feature.timestamp = m_start;
00560     
00561     feature.hasDuration = true;
00562     feature.duration = m_lasttime - m_start;
00563 
00564     // The map contains only peaks and is sorted by filtered acf
00565     // value, so the final element in it is our "best" tempo guess
00566 
00567     std::map<float, int>::const_iterator ci = candidates.end();
00568     --ci;
00569     int maxpi = ci->second;
00570 
00571     if (m_t[maxpi] > 0) {
00572 
00573         // This lag has an adjusted tempo from the averaging process:
00574         // use it
00575 
00576         feature.values[0] = m_t[maxpi];
00577 
00578     } else {
00579 
00580         // shouldn't happen -- it would imply that this high value was
00581         // not a peak!
00582 
00583         feature.values[0] = lag2tempo(maxpi);
00584         cerr << "WARNING: No stored tempo for index " << maxpi << endl;
00585     }
00586 
00587     sprintf(buffer, "%.1f bpm", feature.values[0]);
00588     feature.label = buffer;
00589 
00590     // Return the best tempo in the main output
00591 
00592     fs[TempoOutput].push_back(feature);
00593 
00594     // And return the other estimates (up to the arbitrarily chosen
00595     // number of 10 of them) in the candidates output
00596 
00597     feature.values.clear();
00598     feature.label = "";
00599 
00600     while (feature.values.size() < 10) {
00601         if (m_t[ci->second] > 0) {
00602             feature.values.push_back(m_t[ci->second]);
00603         } else {
00604             feature.values.push_back(lag2tempo(ci->second));
00605         }
00606         if (ci == candidates.begin()) break;
00607         --ci;
00608     }
00609 
00610     fs[CandidatesOutput].push_back(feature);
00611     
00612     return fs;
00613 }
00614 
00615     
00616 
00617 FixedTempoEstimator::FixedTempoEstimator(float inputSampleRate) :
00618     Plugin(inputSampleRate),
00619     m_d(new D(inputSampleRate))
00620 {
00621 }
00622 
00623 FixedTempoEstimator::~FixedTempoEstimator()
00624 {
00625     delete m_d;
00626 }
00627 
00628 string
00629 FixedTempoEstimator::getIdentifier() const
00630 {
00631     return "fixedtempo";
00632 }
00633 
00634 string
00635 FixedTempoEstimator::getName() const
00636 {
00637     return "Simple Fixed Tempo Estimator";
00638 }
00639 
00640 string
00641 FixedTempoEstimator::getDescription() const
00642 {
00643     return "Study a short section of audio and estimate its tempo, assuming the tempo is constant";
00644 }
00645 
00646 string
00647 FixedTempoEstimator::getMaker() const
00648 {
00649     return "Vamp SDK Example Plugins";
00650 }
00651 
00652 int
00653 FixedTempoEstimator::getPluginVersion() const
00654 {
00655     return 1;
00656 }
00657 
00658 string
00659 FixedTempoEstimator::getCopyright() const
00660 {
00661     return "Code copyright 2008 Queen Mary, University of London.  Freely redistributable (BSD license)";
00662 }
00663 
00664 size_t
00665 FixedTempoEstimator::getPreferredStepSize() const
00666 {
00667     return m_d->getPreferredStepSize();
00668 }
00669 
00670 size_t
00671 FixedTempoEstimator::getPreferredBlockSize() const
00672 {
00673     return m_d->getPreferredBlockSize();
00674 }
00675 
00676 bool
00677 FixedTempoEstimator::initialise(size_t channels, size_t stepSize, size_t blockSize)
00678 {
00679     if (channels < getMinChannelCount() ||
00680         channels > getMaxChannelCount()) return false;
00681 
00682     return m_d->initialise(channels, stepSize, blockSize);
00683 }
00684 
00685 void
00686 FixedTempoEstimator::reset()
00687 {
00688     return m_d->reset();
00689 }
00690 
00691 FixedTempoEstimator::ParameterList
00692 FixedTempoEstimator::getParameterDescriptors() const
00693 {
00694     return m_d->getParameterDescriptors();
00695 }
00696 
00697 float
00698 FixedTempoEstimator::getParameter(std::string id) const
00699 {
00700     return m_d->getParameter(id);
00701 }
00702 
00703 void
00704 FixedTempoEstimator::setParameter(std::string id, float value)
00705 {
00706     m_d->setParameter(id, value);
00707 }
00708 
00709 FixedTempoEstimator::OutputList
00710 FixedTempoEstimator::getOutputDescriptors() const
00711 {
00712     return m_d->getOutputDescriptors();
00713 }
00714 
00715 FixedTempoEstimator::FeatureSet
00716 FixedTempoEstimator::process(const float *const *inputBuffers, RealTime ts)
00717 {
00718     return m_d->process(inputBuffers, ts);
00719 }
00720 
00721 FixedTempoEstimator::FeatureSet
00722 FixedTempoEstimator::getRemainingFeatures()
00723 {
00724     return m_d->getRemainingFeatures();
00725 }