#include #include #include #include #include #include #include #include using namespace cv; using namespace std; class AudioDrawing { public: AudioDrawing(const CommandLineParser& parser) { if (!initAndCheckArgs(parser)) { cerr << "Error: Wrong input arguments" << endl; exit(0); } Draw(); } void Draw() { if (draw == "static") { vectorinputAudio = {}; int samplingRate = 0; if (inputType == "file") { samplingRate = readAudioFile(audio, inputAudio); } else if (inputType == "microphone") { samplingRate = readAudioMicrophone(inputAudio); } if ((inputAudio.size() == 0) || samplingRate <= 0) { cerr << "Error: problems with audio reading, check input arguments" << endl; return; } int duration = static_cast(inputAudio.size()) / samplingRate; // since the dimensional grid is counted in integer seconds, // if the input audio has an incomplete last second, // then it is filled with zeros to complete int remainder = static_cast(inputAudio.size()) % samplingRate; if (remainder) { int sizeToFullSec = samplingRate - remainder; for (int j = 0; j < sizeToFullSec; ++j) { inputAudio.push_back(0); } duration += 1; cout << "Update duration of audio to full last second with " << sizeToFullSec << " zero samples" << endl; cout << "New number of samples " << inputAudio.size() << endl; } cout << "Duration of audio = " << duration << " seconds" << endl; // since the dimensional grid is counted in integer seconds, // if duration of file is less than xmarkup, to avoid an incorrect display, // xmarkup will be taken equal to duration if (duration <= xmarkup) { xmarkup = duration + 1; } if (graph == "ampl") { Mat imgAmplitude = drawAmplitude(inputAudio); imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate); imshow("Display amplitude graph", imgAmplitude); waitKey(0); } else if (graph == "spec") { vector>stft = STFT(inputAudio); Mat imgSpec = drawSpectrogram(stft); imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft); imshow("Display spectrogram", imgSpec); waitKey(0); } else if (graph == "ampl_and_spec") { Mat imgAmplitude = drawAmplitude(inputAudio); imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate); vector>stft = STFT(inputAudio); Mat imgSpec = drawSpectrogram(stft); imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft); Mat imgTotal = concatenateImages(imgAmplitude, imgSpec); imshow("Display amplitude graph and spectrogram", imgTotal); waitKey(0); } } else if (draw == "dynamic") { if (inputType == "file") { dynamicFile(audio); } else if (inputType == "microphone") { dynamicMicrophone(); } } } ~AudioDrawing() { } int readAudioFile(string file, vector& inputAudio) { VideoCapture cap; vector params { CAP_PROP_AUDIO_STREAM, audioStream, CAP_PROP_VIDEO_STREAM, -1, CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; cap.open(file, CAP_ANY, params); if (!cap.isOpened()) { cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl; return -1; } const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX); const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl; int samplingRate = static_cast(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; vector frameVec; Mat frame; for (;;) { if (cap.grab()) { cap.retrieve(frame, audioBaseIndex); frameVec = frame; inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end()); } else { cout << "Number of samples: " << inputAudio.size() << endl; break; } } return samplingRate; } int readAudioMicrophone(vector& inputAudio) { VideoCapture cap; vector params { CAP_PROP_AUDIO_STREAM, 0, CAP_PROP_VIDEO_STREAM, -1 }; cap.open(0, CAP_ANY, params); if (!cap.isOpened()) { cerr << "Error: Can't open microphone" << endl; return -1; } const int audioBaseIndex = static_cast(cap.get(CAP_PROP_AUDIO_BASE_INDEX)); const int numberOfChannels = static_cast(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS)); cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString( static_cast(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl; int samplingRate = static_cast(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << samplingRate << endl; cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; const double cvTickFreq = getTickFrequency(); int64 sysTimeCurr = getTickCount(); int64 sysTimePrev = sysTimeCurr; vector frameVec; Mat frame; while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime) { if (cap.grab()) { cap.retrieve(frame, audioBaseIndex); frameVec = frame; inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end()); sysTimeCurr = getTickCount(); } else { cerr << "Error: Grab error" << endl; break; } } cout << "Number of samples: " << inputAudio.size() << endl; return samplingRate; } Mat drawAmplitude(vector& inputAudio) { Scalar color = Scalar(247,111,87); int thickness = 5; int frameVectorRows = 500; int middle = frameVectorRows / 2; // usually the input data is too big, so it is necessary // to reduce size using interpolation of data int frameVectorCols = 40000; if (static_cast(inputAudio.size()) < frameVectorCols) { frameVectorCols = static_cast(inputAudio.size()); } Mat img(frameVectorRows, frameVectorCols, CV_8UC3 , Scalar(255,255,255)); // white background vectorreshapeAudio(inputAudio.size()); for (size_t i = 0; i < inputAudio.size(); ++i) { reshapeAudio[i]=static_cast(inputAudio[i]); } Mat img_frameVector( 1, static_cast(reshapeAudio.size()), CV_64F , reshapeAudio.data()); Mat img_frameVector_resize; resize(img_frameVector, img_frameVector_resize, Size(frameVectorCols, 1), INTER_LINEAR); reshapeAudio = img_frameVector_resize; // normalization data by maximum element normalize(reshapeAudio, reshapeAudio, 1.0, 0.0, NORM_INF); for (size_t i = 0; i < reshapeAudio.size(); ++i) { reshapeAudio[i] = middle - reshapeAudio[i] * middle; } for (int i = 1; i < static_cast(reshapeAudio.size()); ++i) { line(img, Point(i-1, static_cast(reshapeAudio[i-1])), Point(i, static_cast(reshapeAudio[i])), color, thickness); } Mat resImage; resize(img, resImage, Size(900, 400), INTER_AREA ); return resImage; } Mat drawAmplitudeScale(Mat& inputImg, const vector& inputAudio, int samplingRate, int xmin = 0, int xmax = 0) { // function of layout drawing for graph of volume amplitudes // x axis for time // y axis for amplitudes // parameters for the new image size int preCol = 100; int aftCol = 100; int preLine = 40; int aftLine = 50; int frameVectorRows = inputImg.rows; int frameVectorCols = inputImg.cols; int totalRows = preLine + frameVectorRows + aftLine; int totalCols = preCol + frameVectorCols + aftCol; Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3, Scalar(255, 255, 255)); inputImg.copyTo(imgTotal(Rect(preCol, preLine, inputImg.cols, inputImg.rows))); // calculating values on x axis if (xmax == 0) { xmax = static_cast(inputAudio.size()) / samplingRate; } std::vector xList(xmarkup); if (xmax >= xmarkup) { double deltax = (xmax - xmin) / (xmarkup - 1); for (int i = 0; i < xmarkup; ++i) { xList[i] = (xmin + deltax * i); } } else { // this case is used to display a dynamic update vector tmpXList; for (int i = xmin; i < xmax; ++i) { tmpXList.push_back(i + 1); } int k = 0; for (int i = xmarkup - static_cast(tmpXList.size()); i < xmarkup; ++i) { xList[i] = tmpXList[k]; k += 1; } } // calculating values on y axis double minCv; double maxCv; Point minLoc; Point maxLoc; minMaxLoc(inputAudio, &minCv, &maxCv, &minLoc, &maxLoc); int ymin = static_cast(minCv); int ymax = static_cast(maxCv); std::vector yList(ymarkup); double deltay = (ymax - ymin) / (ymarkup - 1); for (int i = 0; i < ymarkup; ++i) { yList[i] = ymin + deltay * i; } // parameters for layout drawing int textThickness = 1; int gridThickness = 1; Scalar gridColor(0, 0, 0); Scalar textColor(0, 0, 0); float fontScale = 0.5; // horizontal axis line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine), gridColor, gridThickness); // vertical axis line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows), gridColor, gridThickness); // parameters for layout calculation int serifSize = 10; int indentDownX = serifSize * 2; int indentDownY = serifSize / 2; int indentLeftX = serifSize; int indentLeftY = 2 * preCol / 3; // drawing layout for x axis int numX = frameVectorCols / (xmarkup - 1); for (size_t i = 0; i < xList.size(); ++i) { int a1 = static_cast(preCol + i * numX); int a2 = frameVectorRows + preLine; int b1 = a1; int b2 = a2 + serifSize; if (enableGrid) { int d1 = a1; int d2 = preLine; line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness); } line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); putText(imgTotal, to_string(int(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX), FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); } // drawing layout for y axis int numY = frameVectorRows / (ymarkup - 1); for (size_t i = 0; i < yList.size(); ++i) { int a1 = preCol; int a2 = static_cast(totalRows - aftLine - i * numY); int b1 = preCol - serifSize; int b2 = a2; if (enableGrid) { int d1 = preCol + frameVectorCols; int d2 = a2; line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness); } line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); putText(imgTotal, to_string(int(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY), FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); } Mat resImage; resize(imgTotal, resImage, Size(cols, rows), INTER_AREA ); return resImage; } vector> STFT(const vector& inputAudio) { // The Short-time Fourier transform (STFT), is a Fourier-related transform used to // determine the sinusoidal frequency and phase content of local sections of a signal // as it changes over time. // In practice, the procedure for computing STFTs is to divide a longer time signal // into shorter segments of equal length and then compute the Fourier transform separately // on each shorter segment. This reveals the Fourier spectrum on each shorter segment. // One then usually plots the changing spectra as a function of time, known as a spectrogram // or waterfall plot. // https://en.wikipedia.org/wiki/Short-time_Fourier_transform int timeStep = windLen - overlap; Mat dstMat; vector stftRow; vector WindType; if (windowType == "Hann") { // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows for (int j = 1 - windLen; j < windLen; j+=2) { WindType.push_back(j * (0.5 * (1 - cos(CV_PI * j / (windLen - 1))))); } } else if (windowType == "Hamming") { // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows for (int j = 1 - windLen; j < windLen; j+=2) { WindType.push_back(j * (0.53836 - 0.46164 * (cos(CV_PI * j / (windLen - 1))))); } } for (size_t i = 0; i < inputAudio.size(); i += timeStep) { vectorsection(windLen, 0); for (int j = 0; j < windLen; ++j) { section[j] = inputAudio[j + i]; } if (windowType == "Hann" || windowType == "Hamming") { for (size_t j = 0; j < section.size(); ++j) { section[j] *= WindType[j]; } } dft(section, dstMat, DFT_COMPLEX_OUTPUT); for (int j = 0; j < dstMat.cols / 4; ++j) { double complModule = sqrt(dstMat.at(2*j) * dstMat.at(2*j) + dstMat.at(2*j+1) * dstMat.at(2*j+1)); stftRow.push_back(complModule); } } size_t xSize = inputAudio.size() / timeStep + 1; // we need only the first part of the spectrum, the second part is symmetrical size_t ySize = dstMat.cols / 4; vector> stft(ySize, vector(xSize, 0.)); for (size_t i = 0; i < xSize; ++i) { for (size_t j = 0; j < ySize; ++j) { // write elements with transposition and convert it to the decibel scale double stftElem = stftRow[ i * ySize + j]; if (stftElem != 0.) { stft[j][i] = 10 * log10(stftElem); } } } return stft; } Mat drawSpectrogram(const vector>& stft) { int frameVectorRows = static_cast(stft.size()); int frameVectorCols = static_cast(stft[0].size()); // Normalization of image values from 0 to 255 to get more contrast image // and this normalization will be taken into account in the scale drawing int colormapImageRows = 255; double minCv; double maxCv; Point minLoc; Point maxLoc; minMaxLoc(stft[0], &minCv, &maxCv, &minLoc, &maxLoc); double maxStft = max(abs(maxCv), abs(minCv)); for (int i = 1; i < frameVectorRows; ++i) { minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc); maxStft = max(maxStft, max(abs(maxCv), abs(minCv))); } // if maxStft is zero (silence) if (maxStft == 0.) { maxStft = 1; } Mat imgSpec(frameVectorRows, frameVectorCols, CV_8UC1, Scalar(255, 255, 255)); for (int i = 0; i < frameVectorRows; ++i) { for (int j = 0; j < frameVectorCols; ++j) { imgSpec.at(frameVectorRows - i - 1, j) = static_cast(stft[i][j] * colormapImageRows / maxStft); } } applyColorMap(imgSpec, imgSpec, COLORMAP_INFERNO); Mat resImage; resize(imgSpec, resImage, Size(900, 400), INTER_AREA); return resImage; } Mat drawSpectrogramColorbar(Mat& inputImg, const vector& inputAudio, int samplingRate, const vector>& stft, int xmin = 0, int xmax = 0) { // function of layout drawing for the three-dimensional graph of the spectrogram // x axis for time // y axis for frequencies // z axis for magnitudes of frequencies shown by color scale // parameters for the new image size int preCol = 100; int aftCol = 100; int preLine = 40; int aftLine = 50; int colColor = 20; int indCol = 20; int frameVectorRows = inputImg.rows; int frameVectorCols = inputImg.cols; int totalRows = preLine + frameVectorRows + aftLine; int totalCols = preCol + frameVectorCols + aftCol; Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255)); inputImg.copyTo(imgTotal(Rect(preCol, preLine, frameVectorCols, frameVectorRows))); // colorbar image due to drawSpectrogram(..) picture has been normalised from 255 to 0, // so here colorbar has values from 255 to 0 int colorArrSize = 256; Mat imgColorBar = Mat (colorArrSize, colColor, CV_8UC1 , Scalar(255,255,255)); for (int i = 0; i < colorArrSize; ++i) { for( int j = 0; j < colColor; ++j) { imgColorBar.at(i, j) = static_cast(colorArrSize - 1 - i); // from 255 to 0 } } applyColorMap(imgColorBar, imgColorBar, COLORMAP_INFERNO); resize(imgColorBar, imgColorBar, Size(colColor, frameVectorRows), INTER_AREA); imgColorBar.copyTo(imgTotal(Rect(preCol + frameVectorCols + indCol, preLine, colColor, frameVectorRows))); // calculating values on x axis if (xmax == 0) { xmax = static_cast(inputAudio.size()) / samplingRate + 1; } vector xList(xmarkup, 0); if (xmax >= xmarkup) { double deltax = (xmax - xmin) / (xmarkup - 1); for(int i = 0; i < xmarkup; ++i) { xList[i] = xmin + deltax * i; } } else { // this case is used to display a dynamic update vector tmpXList; for(int i = xmin; i < xmax; ++i) { tmpXList.push_back(i + 1); } int k = 0; for (int i = xmarkup - static_cast(tmpXList.size()); i < xmarkup; ++i) { xList[i] = tmpXList[k]; k += 1; } } // calculating values on y axis // according to the Nyquist sampling theorem, // signal should posses frequencies equal to half of sampling rate int ymin = 0; int ymax = static_cast(samplingRate / 2); vector yList; double deltay = (ymax - ymin) / (ymarkup - 1); for(int i = 0; i < ymarkup; ++i) { yList.push_back(ymin + deltay * i); } // calculating values on z axis double minCv; double maxCv; Point minLoc; Point maxLoc; minMaxLoc( stft[0], &minCv, &maxCv, &minLoc, &maxLoc); double zmin = minCv, zmax = maxCv; std::vector zList; for (size_t i = 1; i < stft.size(); ++i) { minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc); zmax = max(zmax, maxCv); zmin = min(zmin, minCv); } double deltaz = (zmax - zmin) / (zmarkup - 1); for(int i = 0; i < zmarkup; ++i) { zList.push_back(zmin + deltaz * i); } // parameters for layout drawing int textThickness = 1; int gridThickness = 1; Scalar gridColor(0,0,0); Scalar textColor(0,0,0); float fontScale = 0.5; int serifSize = 10; int indentDownX = serifSize * 2; int indentDownY = serifSize / 2; int indentLeftX = serifSize; int indentLeftY = 2 * preCol / 3; // horizontal axis line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine), gridColor, gridThickness); // vertical axis line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows), gridColor, gridThickness); // drawing layout for x axis int numX = frameVectorCols / (xmarkup - 1); for (size_t i = 0; i < xList.size(); ++i) { int a1 = static_cast(preCol + i * numX); int a2 = frameVectorRows + preLine; int b1 = a1; int b2 = a2 + serifSize; line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); putText(imgTotal, to_string(static_cast(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX), FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); } // drawing layout for y axis int numY = frameVectorRows / (ymarkup - 1); for (size_t i = 0; i < yList.size(); ++i) { int a1 = preCol; int a2 = static_cast(totalRows - aftLine - i * numY); int b1 = preCol - serifSize; int b2 = a2; line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); putText(imgTotal, to_string(static_cast(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY), FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); } // drawing layout for z axis int numZ = frameVectorRows / (zmarkup - 1); for (size_t i = 0; i < zList.size(); ++i) { int a1 = preCol + frameVectorCols + indCol + colColor; int a2 = static_cast(totalRows - aftLine - i * numZ); int b1 = a1 + serifSize; int b2 = a2; line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); putText(imgTotal, to_string(static_cast(zList[i])), Point(b1 + 10, b2 + indentDownY), FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); } Mat resImage; resize(imgTotal, resImage, Size(cols, rows), INTER_AREA ); return resImage; } Mat concatenateImages(Mat& img1, Mat& img2) { // first image will be under the second image int totalRows = img1.rows + img2.rows; int totalCols = max(img1.cols , img2.cols); // if images columns do not match, the difference is filled in white Mat imgTotal = Mat (totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255)); img1.copyTo(imgTotal(Rect(0, 0, img1.cols, img1.rows))); img2.copyTo(imgTotal(Rect(0, img1.rows, img2.cols, img2.rows))); return imgTotal; } void dynamicFile(const string file) { VideoCapture cap; vector params { CAP_PROP_AUDIO_STREAM, audioStream, CAP_PROP_VIDEO_STREAM, -1, CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; cap.open(file, CAP_ANY, params); if (!cap.isOpened()) { cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl; return; } const int audioBaseIndex = static_cast(cap.get(CAP_PROP_AUDIO_BASE_INDEX)); const int numberOfChannels = static_cast(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS)); int samplingRate = static_cast(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl; cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; int step = static_cast(updateTime * samplingRate); int frameSize = static_cast(frameSizeTime * samplingRate); // since the dimensional grid is counted in integer seconds, // if duration of audio frame is less than xmarkup, to avoid an incorrect display, // xmarkup will be taken equal to duration if (frameSizeTime <= xmarkup) { xmarkup = frameSizeTime; } vector buffer; vector frameVector; vector section(frameSize, 0); vector>stft; Mat frame, imgAmplitude, imgSpec, imgTotal; int currentSamples = 0; int xmin = 0; int xmax = 0; for (;;) { if (cap.grab()) { cap.retrieve(frame, audioBaseIndex); frameVector = frame; buffer.insert(buffer.end(), frameVector.begin(), frameVector.end()); int bufferSize = static_cast(buffer.size()); if (bufferSize >= step) { currentSamples += bufferSize; section.erase(section.begin(), section.begin() + step); section.insert(section.end(), buffer.begin(), buffer.end()); buffer.erase(buffer.begin(), buffer.begin() + step); if (currentSamples < frameSize) { xmin = 0; xmax = (currentSamples) / samplingRate; } else { xmin = (currentSamples - frameSize) / samplingRate + 1; xmax = (currentSamples) / samplingRate; } if (graph == "ampl") { imgAmplitude = drawAmplitude(section); imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); imshow("Display amplitude graph", imgAmplitude); waitKey(waitTime); } else if (graph == "spec") { stft = STFT(section); imgSpec = drawSpectrogram(stft); imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); imshow("Display spectrogram", imgSpec); waitKey(waitTime); } else if (graph == "ampl_and_spec") { imgAmplitude = drawAmplitude(section); imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); stft = STFT(section); imgSpec = drawSpectrogram(stft); imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); imgTotal = concatenateImages(imgAmplitude, imgSpec); imshow("Display amplitude graph and spectrogram", imgTotal); waitKey(waitTime); } } } else { break; } } } void dynamicMicrophone() { VideoCapture cap; vector params { CAP_PROP_AUDIO_STREAM, 0, CAP_PROP_VIDEO_STREAM, -1 }; cap.open(0, CAP_MSMF, params); if (!cap.isOpened()) { cerr << "Error: Can't open microphone" << endl; return; } const int audioBaseIndex = static_cast(cap.get(CAP_PROP_AUDIO_BASE_INDEX)); const int numberOfChannels = static_cast(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS)); int samplingRate = static_cast(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl; cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; const double cvTickFreq = getTickFrequency(); int64 sysTimeCurr = getTickCount(); int64 sysTimePrev = sysTimeCurr; int step = (updateTime * samplingRate); int frameSize = (frameSizeTime * samplingRate); // since the dimensional grid is counted in integer seconds, // if duration of audio frame is less than xmarkup, to avoid an incorrect display, // xmarkup will be taken equal to duration if (frameSizeTime <= xmarkup) { xmarkup = frameSizeTime; } vector frameVector; vector buffer; vector section(frameSize, 0); Mat frame, imgAmplitude, imgSpec, imgTotal; int currentSamples = 0; vector> stft; int xmin = 0; int xmax = 0; waitTime = updateTime * 1000; while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime) { if (cap.grab()) { cap.retrieve(frame, audioBaseIndex); frameVector = frame; buffer.insert(buffer.end(), frameVector.begin(), frameVector.end()); sysTimeCurr = getTickCount(); int bufferSize = static_cast(buffer.size()); if (bufferSize >= step) { currentSamples += step; section.erase(section.begin(), section.begin() + step); section.insert(section.end(), buffer.begin(), buffer.end()); buffer.erase(buffer.begin(), buffer.begin() + step); if (currentSamples < frameSize) { xmin = 0; xmax = (currentSamples) / samplingRate; } else { xmin = (currentSamples - frameSize) / samplingRate + 1; xmax = (currentSamples) / samplingRate; } if (graph == "ampl") { imgAmplitude = drawAmplitude(section); imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); imshow("Display amplitude graph", imgAmplitude); waitKey(waitTime); } else if (graph == "spec") { stft = STFT(section); imgSpec = drawSpectrogram(stft); imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); imshow("Display spectrogram", imgSpec); waitKey(waitTime); } else if (graph == "ampl_and_spec") { imgAmplitude = drawAmplitude(section); imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); stft = STFT(section); imgSpec = drawSpectrogram(stft); imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); imgTotal = concatenateImages(imgAmplitude, imgSpec); imshow("Display amplitude graph and spectrogram", imgTotal); waitKey(waitTime); } } } else { cerr << "Error: Grab error" << endl; break; } } } bool initAndCheckArgs(const CommandLineParser& parser) { inputType = parser.get("inputType"); if ((inputType != "file") && (inputType != "microphone")) { cout << "Error: " << inputType << " input method doesnt exist" << endl; return false; } draw = parser.get("draw"); if ((draw != "static") && (draw != "dynamic")) { cout << "Error: " << draw << " draw type doesnt exist" << endl; return false; } graph = parser.get("graph"); if ((graph != "ampl") && (graph != "spec") && (graph != "ampl_and_spec")) { cout << "Error: " << graph << " type of graph doesnt exist" << endl; return false; } audio = samples::findFile(parser.get("audio")); audioStream = parser.get("audioStream"); if (audioStream < 0) { cout << "Error: audioStream = " << audioStream << " - incorrect value. Must be >= 0" << endl; return false; } windowType = parser.get("windowType"); if ((windowType != "Rect") && (windowType != "Hann") && (windowType != "Hamming")) { cout << "Error: " << windowType << " type of window doesnt exist" << endl; return false; } windLen = parser.get("windLen"); if (windLen <= 0) { cout << "Error: windLen = " << windLen << " - incorrect value. Must be > 0" << endl; return false; } overlap = parser.get("overlap"); if (overlap <= 0) { cout << "Error: overlap = " << overlap << " - incorrect value. Must be > 0" << endl; return false; } enableGrid = parser.get("enableGrid"); rows = parser.get("rows"); if (rows <= 0) { cout << "Error: rows = " << rows << " - incorrect value. Must be > 0" << endl; return false; } cols = parser.get("cols"); if (cols <= 0) { cout << "Error: cols = " << cols << " - incorrect value. Must be > 0" << endl; return false; } xmarkup = parser.get("xmarkup"); if (xmarkup < 2) { cout << "Error: xmarkup = " << xmarkup << " - incorrect value. Must be >= 2" << endl; return false; } ymarkup = parser.get("ymarkup"); if (ymarkup < 2) { cout << "Error: ymarkup = " << ymarkup << " - incorrect value. Must be >= 2" << endl; return false; } zmarkup = parser.get("zmarkup"); if (zmarkup < 2) { cout << "Error: zmarkup = " << zmarkup << " - incorrect value. Must be >= 2" << endl; return false; } microTime = parser.get("microTime"); if (microTime <= 0) { cout << "Error: microTime = " << microTime << " - incorrect value. Must be > 0" << endl; return false; } frameSizeTime = parser.get("frameSizeTime"); if (frameSizeTime <= 0) { cout << "Error: frameSizeTime = " << frameSizeTime << " - incorrect value. Must be > 0" << endl; return false; } updateTime = parser.get("updateTime"); if (updateTime <= 0) { cout << "Error: updateTime = " << updateTime << " - incorrect value. Must be > 0" << endl; return false; } waitTime = parser.get("waitTime"); if (waitTime < 0) { cout << "Error: waitTime = " << waitTime << " - incorrect value. Must be >= 0" << endl; return false; } return true; } private : string inputType; string draw; string graph; string audio; int audioStream; string windowType; int windLen; int overlap; bool enableGrid; int rows; int cols; int xmarkup; int ymarkup; int zmarkup; int microTime; int frameSizeTime; int updateTime; int waitTime; }; int main(int argc, char** argv) { const String keys = "{help h usage ? | | this sample draws a volume graph and/or spectrogram of audio/video files and microphone \n\t\tDefault usage: ./Spectrogram.exe}" "{inputType i | file | file or microphone }" "{draw d | static | type of drawing: \n\t\t\tstatic - for plotting graph(s) across the entire input audio \n\t\t\tdynamic - for plotting graph(s) in a time-updating window}" "{graph g | ampl_and_spec | type of graph: amplitude graph or/and spectrogram. Please use tags below : \n\t\t\tampl - draw the amplitude graph \n\t\t\tspec - draw the spectrogram\n\t\t\tampl_and_spec - draw the amplitude graph and spectrogram on one image under each other}" "{audio a | Megamind.avi | name and path to file }" "{audioStream s | 1 | CAP_PROP_AUDIO_STREAM value. Select audio stream number }" "{windowType t | Rect | type of window for STFT. Please use tags below : \n\t\t\tRect/Hann/Hamming }" "{windLen l | 256 | size of window for STFT }" "{overlap o | 128 | overlap of windows for STFT }" "{enableGrid | false | grid on the amplitude graph }" "{rows r | 400 | rows of output image }" "{cols c | 900 | cols of output image }" "{xmarkup x | 5 | number of x axis divisions (time asix) }" "{ymarkup y | 5 | number of y axis divisions (frequency or/and amplitude axis) }" "{zmarkup z | 5 | number of z axis divisions (colorbar) }" "{microTime m | 20 | time of recording audio with microphone in seconds }" "{frameSizeTime f| 5 | size of sliding window in seconds }" "{updateTime u | 1 | update time of sliding window in seconds }" "{waitTime w | 10 | parameter to cv.waitKey() for dynamic update of file input, takes values in milliseconds }" ; CommandLineParser parser(argc, argv, keys); if (parser.has("help")) { parser.printMessage(); return 0; } AudioDrawing draw(parser); return 0; }