FixedTempoEstimator.cpp

Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
00002 
00003 /*
00004     Vamp
00005 
00006     An API for audio analysis and feature extraction plugins.
00007 
00008     Centre for Digital Music, Queen Mary, University of London.
00009     Copyright 2006-2009 Chris Cannam and QMUL.
00010   
00011     Permission is hereby granted, free of charge, to any person
00012     obtaining a copy of this software and associated documentation
00013     files (the "Software"), to deal in the Software without
00014     restriction, including without limitation the rights to use, copy,
00015     modify, merge, publish, distribute, sublicense, and/or sell copies
00016     of the Software, and to permit persons to whom the Software is
00017     furnished to do so, subject to the following conditions:
00018 
00019     The above copyright notice and this permission notice shall be
00020     included in all copies or substantial portions of the Software.
00021 
00022     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
00023     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00024     MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
00025     NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
00026     ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
00027     CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
00028     WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00029 
00030     Except as contained in this notice, the names of the Centre for
00031     Digital Music; Queen Mary, University of London; and Chris Cannam
00032     shall not be used in advertising or otherwise to promote the sale,
00033     use or other dealings in this Software without prior written
00034     authorization.
00035 */
00036 
#include "FixedTempoEstimator.h"

#include <cmath>
#include <cstdio>

using std::string;
using std::vector;
using std::cerr;
using std::endl;

using Vamp::RealTime;
00047 
00048 
00049 class FixedTempoEstimator::D
00050 // this class just avoids us having to declare any data members in the header
00051 {
00052 public:
00053     D(float inputSampleRate);
00054     ~D();
00055 
00056     size_t getPreferredStepSize() const { return 64; }
00057     size_t getPreferredBlockSize() const { return 256; }
00058 
00059     ParameterList getParameterDescriptors() const;
00060     float getParameter(string id) const;
00061     void setParameter(string id, float value);
00062 
00063     OutputList getOutputDescriptors() const;
00064 
00065     bool initialise(size_t channels, size_t stepSize, size_t blockSize);
00066     void reset();
00067     FeatureSet process(const float *const *, RealTime);
00068     FeatureSet getRemainingFeatures();
00069 
00070 private:
00071     void calculate();
00072     FeatureSet assembleFeatures();
00073 
00074     float lag2tempo(int);
00075     int tempo2lag(float);
00076 
00077     float m_inputSampleRate;
00078     size_t m_stepSize;
00079     size_t m_blockSize;
00080 
00081     float m_minbpm;
00082     float m_maxbpm;
00083     float m_maxdflen;
00084 
00085     float *m_priorMagnitudes;
00086 
00087     size_t m_dfsize;
00088     float *m_df;
00089     float *m_r;
00090     float *m_fr;
00091     float *m_t;
00092     size_t m_n;
00093 
00094     Vamp::RealTime m_start;
00095     Vamp::RealTime m_lasttime;
00096 };
00097 
00098 FixedTempoEstimator::D::D(float inputSampleRate) :
00099     m_inputSampleRate(inputSampleRate),
00100     m_stepSize(0),
00101     m_blockSize(0),
00102     m_minbpm(50),
00103     m_maxbpm(190),
00104     m_maxdflen(10),
00105     m_priorMagnitudes(0),
00106     m_df(0),
00107     m_r(0),
00108     m_fr(0),
00109     m_t(0),
00110     m_n(0)
00111 {
00112 }
00113 
00114 FixedTempoEstimator::D::~D()
00115 {
00116     delete[] m_priorMagnitudes;
00117     delete[] m_df;
00118     delete[] m_r;
00119     delete[] m_fr;
00120     delete[] m_t;
00121 }
00122 
00123 FixedTempoEstimator::ParameterList
00124 FixedTempoEstimator::D::getParameterDescriptors() const
00125 {
00126     ParameterList list;
00127 
00128     ParameterDescriptor d;
00129     d.identifier = "minbpm";
00130     d.name = "Minimum estimated tempo";
00131     d.description = "Minimum beat-per-minute value which the tempo estimator is able to return";
00132     d.unit = "bpm";
00133     d.minValue = 10;
00134     d.maxValue = 360;
00135     d.defaultValue = 50;
00136     d.isQuantized = false;
00137     list.push_back(d);
00138 
00139     d.identifier = "maxbpm";
00140     d.name = "Maximum estimated tempo";
00141     d.description = "Maximum beat-per-minute value which the tempo estimator is able to return";
00142     d.defaultValue = 190;
00143     list.push_back(d);
00144 
00145     d.identifier = "maxdflen";
00146     d.name = "Input duration to study";
00147     d.description = "Length of audio input, in seconds, which should be taken into account when estimating tempo.  There is no need to supply the plugin with any further input once this time has elapsed since the start of the audio.  The tempo estimator may use only the first part of this, up to eight times the slowest beat duration: increasing this value further than that is unlikely to improve results.";
00148     d.unit = "s";
00149     d.minValue = 2;
00150     d.maxValue = 40;
00151     d.defaultValue = 10;
00152     list.push_back(d);
00153 
00154     return list;
00155 }
00156 
00157 float
00158 FixedTempoEstimator::D::getParameter(string id) const
00159 {
00160     if (id == "minbpm") {
00161         return m_minbpm;
00162     } else if (id == "maxbpm") {
00163         return m_maxbpm;
00164     } else if (id == "maxdflen") {
00165         return m_maxdflen;
00166     }
00167     return 0.f;
00168 }
00169 
00170 void
00171 FixedTempoEstimator::D::setParameter(string id, float value)
00172 {
00173     if (id == "minbpm") {
00174         m_minbpm = value;
00175     } else if (id == "maxbpm") {
00176         m_maxbpm = value;
00177     } else if (id == "maxdflen") {
00178         m_maxdflen = value;
00179     }
00180 }
00181 
// Indices of the plugin's outputs, used as keys into the FeatureSet
// returned by assembleFeatures().  They must match the order in which
// getOutputDescriptors() appends its descriptors.  Declared const
// because they are fixed identifiers, not state.
static const int TempoOutput = 0;
static const int CandidatesOutput = 1;
static const int DFOutput = 2;
static const int ACFOutput = 3;
static const int FilteredACFOutput = 4;
00187 
00188 FixedTempoEstimator::OutputList
00189 FixedTempoEstimator::D::getOutputDescriptors() const
00190 {
00191     OutputList list;
00192 
00193     OutputDescriptor d;
00194     d.identifier = "tempo";
00195     d.name = "Tempo";
00196     d.description = "Estimated tempo";
00197     d.unit = "bpm";
00198     d.hasFixedBinCount = true;
00199     d.binCount = 1;
00200     d.hasKnownExtents = false;
00201     d.isQuantized = false;
00202     d.sampleType = OutputDescriptor::VariableSampleRate;
00203     d.sampleRate = m_inputSampleRate;
00204     d.hasDuration = true; // our returned tempo spans a certain range
00205     list.push_back(d);
00206 
00207     d.identifier = "candidates";
00208     d.name = "Tempo candidates";
00209     d.description = "Possible tempo estimates, one per bin with the most likely in the first bin";
00210     d.unit = "bpm";
00211     d.hasFixedBinCount = false;
00212     list.push_back(d);
00213 
00214     d.identifier = "detectionfunction";
00215     d.name = "Detection Function";
00216     d.description = "Onset detection function";
00217     d.unit = "";
00218     d.hasFixedBinCount = 1;
00219     d.binCount = 1;
00220     d.hasKnownExtents = true;
00221     d.minValue = 0.0;
00222     d.maxValue = 1.0;
00223     d.isQuantized = false;
00224     d.quantizeStep = 0.0;
00225     d.sampleType = OutputDescriptor::FixedSampleRate;
00226     if (m_stepSize) {
00227         d.sampleRate = m_inputSampleRate / m_stepSize;
00228     } else {
00229         d.sampleRate = m_inputSampleRate / (getPreferredBlockSize()/2);
00230     }
00231     d.hasDuration = false;
00232     list.push_back(d);
00233 
00234     d.identifier = "acf";
00235     d.name = "Autocorrelation Function";
00236     d.description = "Autocorrelation of onset detection function";
00237     d.hasKnownExtents = false;
00238     d.unit = "r";
00239     list.push_back(d);
00240 
00241     d.identifier = "filtered_acf";
00242     d.name = "Filtered Autocorrelation";
00243     d.description = "Filtered autocorrelation of onset detection function";
00244     d.unit = "r";
00245     list.push_back(d);
00246 
00247     return list;
00248 }
00249 
00250 bool
00251 FixedTempoEstimator::D::initialise(size_t, size_t stepSize, size_t blockSize)
00252 {
00253     m_stepSize = stepSize;
00254     m_blockSize = blockSize;
00255 
00256     float dfLengthSecs = m_maxdflen;
00257     m_dfsize = (dfLengthSecs * m_inputSampleRate) / m_stepSize;
00258 
00259     m_priorMagnitudes = new float[m_blockSize/2];
00260     m_df = new float[m_dfsize];
00261 
00262     for (size_t i = 0; i < m_blockSize/2; ++i) {
00263         m_priorMagnitudes[i] = 0.f;
00264     }
00265     for (size_t i = 0; i < m_dfsize; ++i) {
00266         m_df[i] = 0.f;
00267     }
00268 
00269     m_n = 0;
00270 
00271     return true;
00272 }
00273 
00274 void
00275 FixedTempoEstimator::D::reset()
00276 {
00277     if (!m_priorMagnitudes) return;
00278 
00279     for (size_t i = 0; i < m_blockSize/2; ++i) {
00280         m_priorMagnitudes[i] = 0.f;
00281     }
00282     for (size_t i = 0; i < m_dfsize; ++i) {
00283         m_df[i] = 0.f;
00284     }
00285 
00286     delete[] m_r;
00287     m_r = 0;
00288 
00289     delete[] m_fr; 
00290     m_fr = 0;
00291 
00292     delete[] m_t; 
00293     m_t = 0;
00294 
00295     m_n = 0;
00296 
00297     m_start = RealTime::zeroTime;
00298     m_lasttime = RealTime::zeroTime;
00299 }
00300 
00301 FixedTempoEstimator::FeatureSet
00302 FixedTempoEstimator::D::process(const float *const *inputBuffers, RealTime ts)
00303 {
00304     FeatureSet fs;
00305 
00306     if (m_stepSize == 0) {
00307         cerr << "ERROR: FixedTempoEstimator::process: "
00308              << "FixedTempoEstimator has not been initialised"
00309              << endl;
00310         return fs;
00311     }
00312 
00313     if (m_n == 0) m_start = ts;
00314     m_lasttime = ts;
00315 
00316     if (m_n == m_dfsize) {
00317         // If we have seen enough input, do the estimation and return
00318         calculate();
00319         fs = assembleFeatures();
00320         ++m_n;
00321         return fs;
00322     }
00323 
00324     // If we have seen more than enough, just discard and return!
00325     if (m_n > m_dfsize) return FeatureSet();
00326 
00327     float value = 0.f;
00328 
00329     // m_df will contain an onset detection function based on the rise
00330     // in overall power from one spectral frame to the next --
00331     // simplistic but reasonably effective for our purposes.
00332 
00333     for (size_t i = 1; i < m_blockSize/2; ++i) {
00334 
00335         float real = inputBuffers[0][i*2];
00336         float imag = inputBuffers[0][i*2 + 1];
00337 
00338         float sqrmag = real * real + imag * imag;
00339         value += fabsf(sqrmag - m_priorMagnitudes[i]);
00340 
00341         m_priorMagnitudes[i] = sqrmag;
00342     }
00343 
00344     m_df[m_n] = value;
00345 
00346     ++m_n;
00347     return fs;
00348 }    
00349 
00350 FixedTempoEstimator::FeatureSet
00351 FixedTempoEstimator::D::getRemainingFeatures()
00352 {
00353     FeatureSet fs;
00354     if (m_n > m_dfsize) return fs;
00355     calculate();
00356     fs = assembleFeatures();
00357     ++m_n;
00358     return fs;
00359 }
00360 
00361 float
00362 FixedTempoEstimator::D::lag2tempo(int lag)
00363 {
00364     return 60.f / ((lag * m_stepSize) / m_inputSampleRate);
00365 }
00366 
00367 int
00368 FixedTempoEstimator::D::tempo2lag(float tempo)
00369 {
00370     return ((60.f / tempo) * m_inputSampleRate) / m_stepSize;
00371 }
00372 
00373 void
00374 FixedTempoEstimator::D::calculate()
00375 {    
00376     if (m_r) {
00377         cerr << "FixedTempoEstimator::calculate: calculation already happened?" << endl;
00378         return;
00379     }
00380 
00381     if (m_n < m_dfsize / 9 &&
00382         m_n < (1.0 * m_inputSampleRate) / m_stepSize) { // 1 second
00383         cerr << "FixedTempoEstimator::calculate: Input is too short" << endl;
00384         return;
00385     }
00386 
00387     // This function takes m_df (the detection function array filled
00388     // out in process()) and calculates m_r (the raw autocorrelation)
00389     // and m_fr (the filtered autocorrelation from whose peaks tempo
00390     // estimates will be taken).
00391 
00392     int n = m_n; // length of actual df array (m_dfsize is the theoretical max)
00393 
00394     m_r  = new float[n/2]; // raw autocorrelation
00395     m_fr = new float[n/2]; // filtered autocorrelation
00396     m_t  = new float[n/2]; // averaged tempo estimate for each lag value
00397 
00398     for (int i = 0; i < n/2; ++i) {
00399         m_r[i]  = 0.f;
00400         m_fr[i] = 0.f;
00401         m_t[i]  = lag2tempo(i);
00402     }
00403 
00404     // Calculate the raw autocorrelation of the detection function
00405 
00406     for (int i = 0; i < n/2; ++i) {
00407 
00408         for (int j = i; j < n; ++j) {
00409             m_r[i] += m_df[j] * m_df[j - i];
00410         }
00411 
00412         m_r[i] /= n - i - 1;
00413     }
00414 
00415     // Filter the autocorrelation and average out the tempo estimates
00416     
00417     float related[] = { 0.5, 2, 4, 8 };
00418 
00419     for (int i = 1; i < n/2-1; ++i) {
00420 
00421         m_fr[i] = m_r[i];
00422 
00423         int div = 1;
00424 
00425         for (int j = 0; j < int(sizeof(related)/sizeof(related[0])); ++j) {
00426 
00427             // Check for an obvious peak at each metrically related lag
00428 
00429             int k0 = int(i * related[j] + 0.5);
00430 
00431             if (k0 >= 0 && k0 < int(n/2)) {
00432 
00433                 int kmax = 0, kmin = 0;
00434                 float kvmax = 0, kvmin = 0;
00435                 bool have = false;
00436 
00437                 for (int k = k0 - 1; k <= k0 + 1; ++k) {
00438 
00439                     if (k < 0 || k >= n/2) continue;
00440 
00441                     if (!have || (m_r[k] > kvmax)) { kmax = k; kvmax = m_r[k]; }
00442                     if (!have || (m_r[k] < kvmin)) { kmin = k; kvmin = m_r[k]; }
00443                     
00444                     have = true;
00445                 }
00446                 
00447                 // Boost the original lag according to the strongest
00448                 // value found close to this related lag
00449 
00450                 m_fr[i] += m_r[kmax] / 5;
00451 
00452                 if ((kmax == 0 || m_r[kmax] > m_r[kmax-1]) &&
00453                     (kmax == n/2-1 || m_r[kmax] > m_r[kmax+1]) &&
00454                     kvmax > kvmin * 1.05) {
00455 
00456                     // The strongest value close to the related lag is
00457                     // also a pretty good looking peak, so use it to
00458                     // improve our tempo estimate for the original lag
00459                     
00460                     m_t[i] = m_t[i] + lag2tempo(kmax) * related[j];
00461                     ++div;
00462                 }
00463             }
00464         }
00465         
00466         m_t[i] /= div;
00467         
00468         // Finally apply a primitive perceptual weighting (to prefer
00469         // tempi of around 120-130)
00470 
00471         float weight = 1.f - fabsf(128.f - lag2tempo(i)) * 0.005;
00472         if (weight < 0.f) weight = 0.f;
00473         weight = weight * weight * weight;
00474 
00475         m_fr[i] += m_fr[i] * (weight / 3);
00476     }
00477 }
00478     
00479 FixedTempoEstimator::FeatureSet
00480 FixedTempoEstimator::D::assembleFeatures()
00481 {
00482     FeatureSet fs;
00483     if (!m_r) return fs; // No autocorrelation: no results
00484 
00485     Feature feature;
00486     feature.hasTimestamp = true;
00487     feature.hasDuration = false;
00488     feature.label = "";
00489     feature.values.clear();
00490     feature.values.push_back(0.f);
00491 
00492     char buffer[40];
00493 
00494     int n = m_n;
00495 
00496     for (int i = 0; i < n; ++i) {
00497 
00498         // Return the detection function in the DF output
00499 
00500         feature.timestamp = m_start +
00501             RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
00502         feature.values[0] = m_df[i];
00503         feature.label = "";
00504         fs[DFOutput].push_back(feature);
00505     }
00506 
00507     for (int i = 1; i < n/2; ++i) {
00508 
00509         // Return the raw autocorrelation in the ACF output, each
00510         // value labelled according to its corresponding tempo
00511 
00512         feature.timestamp = m_start +
00513             RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
00514         feature.values[0] = m_r[i];
00515         sprintf(buffer, "%.1f bpm", lag2tempo(i));
00516         if (i == n/2-1) feature.label = "";
00517         else feature.label = buffer;
00518         fs[ACFOutput].push_back(feature);
00519     }
00520 
00521     float t0 = m_minbpm; // our minimum detected tempo
00522     float t1 = m_maxbpm; // our maximum detected tempo
00523 
00524     int p0 = tempo2lag(t1);
00525     int p1 = tempo2lag(t0);
00526 
00527     std::map<float, int> candidates;
00528 
00529     for (int i = p0; i <= p1 && i+1 < n/2; ++i) {
00530 
00531         if (m_fr[i] > m_fr[i-1] &&
00532             m_fr[i] > m_fr[i+1]) {
00533 
00534             // This is a peak in the filtered autocorrelation: stick
00535             // it into the map from filtered autocorrelation to lag
00536             // index -- this sorts our peaks by filtered acf value
00537 
00538             candidates[m_fr[i]] = i;
00539         }
00540 
00541         // Also return the filtered autocorrelation in its own output
00542 
00543         feature.timestamp = m_start +
00544             RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
00545         feature.values[0] = m_fr[i];
00546         sprintf(buffer, "%.1f bpm", lag2tempo(i));
00547         if (i == p1 || i == n/2-2) feature.label = "";
00548         else feature.label = buffer;
00549         fs[FilteredACFOutput].push_back(feature);
00550     }
00551 
00552     if (candidates.empty()) {
00553         cerr << "No tempo candidates!" << endl;
00554         return fs;
00555     }
00556 
00557     feature.hasTimestamp = true;
00558     feature.timestamp = m_start;
00559     
00560     feature.hasDuration = true;
00561     feature.duration = m_lasttime - m_start;
00562 
00563     // The map contains only peaks and is sorted by filtered acf
00564     // value, so the final element in it is our "best" tempo guess
00565 
00566     std::map<float, int>::const_iterator ci = candidates.end();
00567     --ci;
00568     int maxpi = ci->second;
00569 
00570     if (m_t[maxpi] > 0) {
00571 
00572         // This lag has an adjusted tempo from the averaging process:
00573         // use it
00574 
00575         feature.values[0] = m_t[maxpi];
00576 
00577     } else {
00578 
00579         // shouldn't happen -- it would imply that this high value was
00580         // not a peak!
00581 
00582         feature.values[0] = lag2tempo(maxpi);
00583         cerr << "WARNING: No stored tempo for index " << maxpi << endl;
00584     }
00585 
00586     sprintf(buffer, "%.1f bpm", feature.values[0]);
00587     feature.label = buffer;
00588 
00589     // Return the best tempo in the main output
00590 
00591     fs[TempoOutput].push_back(feature);
00592 
00593     // And return the other estimates (up to the arbitrarily chosen
00594     // number of 10 of them) in the candidates output
00595 
00596     feature.values.clear();
00597     feature.label = "";
00598 
00599     while (feature.values.size() < 10) {
00600         if (m_t[ci->second] > 0) {
00601             feature.values.push_back(m_t[ci->second]);
00602         } else {
00603             feature.values.push_back(lag2tempo(ci->second));
00604         }
00605         if (ci == candidates.begin()) break;
00606         --ci;
00607     }
00608 
00609     fs[CandidatesOutput].push_back(feature);
00610     
00611     return fs;
00612 }
00613 
00614     
00615 
00616 FixedTempoEstimator::FixedTempoEstimator(float inputSampleRate) :
00617     Plugin(inputSampleRate),
00618     m_d(new D(inputSampleRate))
00619 {
00620 }
00621 
00622 FixedTempoEstimator::~FixedTempoEstimator()
00623 {
00624     delete m_d;
00625 }
00626 
00627 string
00628 FixedTempoEstimator::getIdentifier() const
00629 {
00630     return "fixedtempo";
00631 }
00632 
00633 string
00634 FixedTempoEstimator::getName() const
00635 {
00636     return "Simple Fixed Tempo Estimator";
00637 }
00638 
00639 string
00640 FixedTempoEstimator::getDescription() const
00641 {
00642     return "Study a short section of audio and estimate its tempo, assuming the tempo is constant";
00643 }
00644 
00645 string
00646 FixedTempoEstimator::getMaker() const
00647 {
00648     return "Vamp SDK Example Plugins";
00649 }
00650 
00651 int
00652 FixedTempoEstimator::getPluginVersion() const
00653 {
00654     return 1;
00655 }
00656 
00657 string
00658 FixedTempoEstimator::getCopyright() const
00659 {
00660     return "Code copyright 2008 Queen Mary, University of London.  Freely redistributable (BSD license)";
00661 }
00662 
00663 size_t
00664 FixedTempoEstimator::getPreferredStepSize() const
00665 {
00666     return m_d->getPreferredStepSize();
00667 }
00668 
00669 size_t
00670 FixedTempoEstimator::getPreferredBlockSize() const
00671 {
00672     return m_d->getPreferredBlockSize();
00673 }
00674 
00675 bool
00676 FixedTempoEstimator::initialise(size_t channels, size_t stepSize, size_t blockSize)
00677 {
00678     if (channels < getMinChannelCount() ||
00679         channels > getMaxChannelCount()) return false;
00680 
00681     return m_d->initialise(channels, stepSize, blockSize);
00682 }
00683 
00684 void
00685 FixedTempoEstimator::reset()
00686 {
00687     return m_d->reset();
00688 }
00689 
00690 FixedTempoEstimator::ParameterList
00691 FixedTempoEstimator::getParameterDescriptors() const
00692 {
00693     return m_d->getParameterDescriptors();
00694 }
00695 
00696 float
00697 FixedTempoEstimator::getParameter(std::string id) const
00698 {
00699     return m_d->getParameter(id);
00700 }
00701 
00702 void
00703 FixedTempoEstimator::setParameter(std::string id, float value)
00704 {
00705     m_d->setParameter(id, value);
00706 }
00707 
00708 FixedTempoEstimator::OutputList
00709 FixedTempoEstimator::getOutputDescriptors() const
00710 {
00711     return m_d->getOutputDescriptors();
00712 }
00713 
00714 FixedTempoEstimator::FeatureSet
00715 FixedTempoEstimator::process(const float *const *inputBuffers, RealTime ts)
00716 {
00717     return m_d->process(inputBuffers, ts);
00718 }
00719 
00720 FixedTempoEstimator::FeatureSet
00721 FixedTempoEstimator::getRemainingFeatures()
00722 {
00723     return m_d->getRemainingFeatures();
00724 }