ronwpubs.bib

@inproceedings{ellis06pvocvq,
  author = {D. P. W. Ellis and R. J. Weiss},
  title = {{Model-Based Monaural Source Separation Using a Vector-Quantized Phase-Vocoder Representation}},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  pages = {V-957-960},
  month = may,
  year = {2006},
  address = {Toulouse, France},
  doi = {10.1109/ICASSP.2006.1661436},
  pdf = {pubs/icassp2006-pvocvq.pdf}
}
@inproceedings{weiss06rvmsep,
  author = {R. J. Weiss and D. P. W. Ellis},
  title = {{Estimating Single-Channel Source Separation Masks: Relevance Vector Machine Classifiers vs. Pitch-Based Masking}},
  booktitle = {Proc. {ISCA} Tutorial and Research Workshop on Statistical and Perceptual Audition ({SAPA})},
  pages = {31-36},
  month = sep,
  year = {2006},
  address = {Pittsburgh, USA},
  http = {http://www.isca-speech.org/archive/sapa_2006/sap6_031.html},
  pdf = {pubs/sapa2006-rvmpvsourcesep.pdf},
  slides = {pubs/sapa2006-rvmpvsourcesep-slides.pdf}
}
@inproceedings{weiss07adapted_models,
  author = {R. J. Weiss and D. P. W. Ellis},
  title = {{Monaural Speech Separation Using Source-Adapted Models}},
  booktitle = {Proc. {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics ({WASPAA})},
  month = oct,
  year = 2007,
  pages = {114-117},
  address = {New Paltz, USA},
  doi = {10.1109/ASPAA.2007.4393039},
  pdf = {pubs/waspaa2007-adapted_models.pdf},
  slides = {pubs/waspaa2007-adapted_models-slides.pdf},
  web = {SSC.html}
}
@inproceedings{weiss08messlsp,
  author = {R. J. Weiss and M. I. Mandel and D. P. W. Ellis},
  title = {{Source Separation Based on Binaural Cues and Source Model Constraints}},
  booktitle = {Proc. Interspeech},
  pages = {419-422},
  month = sep,
  year = {2008},
  address = {Brisbane, Australia},
  http = {http://www.isca-speech.org/archive/interspeech_2008/i08_0419.html},
  pdf = {pubs/icslp2008-messl_sp.pdf},
  poster = {pubs/icslp2008-messl_sp-poster.pdf}
}
@inproceedings{weiss08dysana,
  author = {R. J. Weiss and T. Kristjansson},
  title = {{{DySANA}: Dynamic Speech and Noise Adaptation for Voice Activity Detection}},
  booktitle = {Proc. Interspeech},
  pages = {127-130},
  month = sep,
  year = {2008},
  address = {Brisbane, Australia},
  http = {http://www.isca-speech.org/archive/interspeech_2008/i08_0127.html},
  pdf = {pubs/icslp2008-dysana.pdf},
  poster = {pubs/icslp2008-dysana-poster.pdf}
}
@inproceedings{weiss09vem,
  author = {R. J. Weiss and D. P. W. Ellis},
  title = {{A Variational EM Algorithm for Learning Eigenvoice Parameters in Mixed Signals}},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  pages = {113-116},
  month = apr,
  year = 2009,
  address = {Taipei, Taiwan},
  doi = {10.1109/ICASSP.2009.4959533},
  pdf = {pubs/icassp2009-ev_vem.pdf},
  poster = {pubs/icassp2009-ev_vem-poster.pdf}
}
@phdthesis{weiss09thesis,
  title = {Underdetermined Source Separation Using Speaker Subspace Models},
  author = {R. J. Weiss},
  year = {2009},
  school = {Department of {E}lectrical {E}ngineering, Columbia University},
  publisher = {Columbia University},
  pdf = {pubs/ronw-thesis.pdf},
  slides = {pubs/ronw-thesis-slides.pdf},
  abstract = {Sounds rarely occur in isolation. Despite this, significant effort has been dedicated to the
design of computer audition systems, such as speech recognizers, that can only analyze
isolated sound sources. In fact, there are a variety of applications in both human and
computer audition for which it is desirable to understand more complex auditory scenes.
In order to extend such systems to operate on mixtures of many sources, the ability to
recover the source signals from the mixture is required. This process is known as source
separation.

In this thesis we focus on the problem of underdetermined source separation where the
number of sources is greater than the number of channels in the observed mixture. In
the worst case, when the observations are derived from a single microphone, it is often
necessary for a separation algorithm to utilize prior information about the sources present
in the mixture to constrain possible source reconstructions. A common approach for
separating such signals is based on the use of source-specific statistical models. In most
cases this approach requires that significant training data be available to train models for
the sources known in advance to be present in the mixed signal. We propose a speaker
subspace model for source adaptation that alleviates this requirement.

We report a series of experiments on monaural mixtures of speech signals and demonstrate
that the use of the proposed speaker subspace model can separate sources far better than
the use of unadapted, source-independent models. The proposed method also outperforms
other state-of-the-art approaches when training data is not available for the exact speakers
present in the mixed signal.

Finally, we describe a system for binaural speech separation that combines constraints
based on interaural localization cues with constraints derived from source models.
Although a simpler system based only on localization cues is sometimes able to
adequately isolate sources, the incorporation of a source-independent model is shown to
significantly improve performance. Further improvements are obtained by using the
proposed speaker subspace model to adapt to the sources present in the signal.}
}
@article{weiss10ssc,
  title = {{Speech Separation Using Speaker-Adapted Eigenvoice Speech Models}},
  author = {R. J. Weiss and D. P. W. Ellis},
  journal = {Computer Speech and Language},
  month = jan,
  year = {2010},
  volume = {24},
  number = {1},
  pages = {16-29},
  note = {Special issue on the Speech Separation and Recognition Challenge},
  issn = {0885-2308},
  doi = {10.1016/j.csl.2008.03.003},
  pdf = {pubs/csl2008-eigenvoice_speech_sep.pdf},
  abstract = {We present a system for model-based source separation for use on single-channel speech mixtures where the precise source characteristics are not known a priori.  The sources are modeled using hidden Markov models (HMM) and separated using factorial HMM methods.  Without prior speaker models for the sources in the mixture it is difficult to exactly resolve the individual sources because there is no way to determine which state corresponds to which source at any point in time.  This is solved to a small extent by the temporal constraints provided by the Markov models, but permutations between sources remain a significant problem.  We overcome this by adapting the models to match the sources in the mixture.  We do this by representing the space of speaker variation with a parametric signal model based on the eigenvoice technique for rapid speaker adaptation.  We present an algorithm to infer the characteristics of the sources present in a mixture, allowing for significantly improved separation performance over that obtained using unadapted source models.  The algorithm is evaluated on the task defined in the 2006 Speech Separation Challenge and compared with separation using source-dependent models.  Although performance is not as good as with speaker-dependent models, we show that the system based on model adaptation is able to generalize better to held-out speakers.}
}
@article{mandel10messl,
  title = {{Model-Based Expectation-Maximization Source Separation and Localization}},
  author = {M. I. Mandel and R. J. Weiss and D. P. W. Ellis},
  journal = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  year = {2010},
  month = feb,
  volume = {18},
  number = {2},
  pages = {382-394},
  doi = {10.1109/TASL.2009.2029711},
  issn = {1558-7916},
  pdf = {pubs/taslp09-messl.pdf},
  abstract = {This paper describes a system, referred to as model-based expectation-maximization source separation and localization (MESSL), for separating and localizing multiple sound sources from an underdetermined reverberant two-channel recording. By clustering individual spectrogram points based on their interaural phase and level differences, MESSL generates masks that can be used to isolate individual sound sources. We first describe a probabilistic model of interaural parameters that can be evaluated at individual spectrogram points. By creating a mixture of these models over sources and delays, the multi-source localization problem is reduced to a collection of single source problems. We derive an expectation-maximization algorithm for computing the maximum-likelihood parameters of this mixture model, and show that these parameters correspond well with interaural parameters measured in isolation. As a byproduct of fitting this mixture model, the algorithm creates probabilistic spectrogram masks that can be used for source separation. In simulated anechoic and reverberant environments, separations using MESSL produced on average a signal-to-distortion ratio 1.6 dB greater and perceptual evaluation of speech quality (PESQ) results 0.27 mean opinion score units greater than four comparable algorithms.},
  web = {http://github.com/mim/messl}
}
@inproceedings{weiss10nmfseg,
  author = {R. J. Weiss and J. P. Bello},
  title = {{Identifying Repeated Patterns in Music Using Sparse Convolutive Non-Negative Matrix Factorization}},
  booktitle = {Proc. International Society for Music Information Retrieval Conference ({ISMIR})},
  pages = {123-128},
  month = aug,
  year = 2010,
  address = {Utrecht, Netherlands},
  note = {Best Paper Award},
  pdf = {pubs/ismir2010-nmfseg.pdf},
  web = {http://ronw.github.com/siplca-segmentation},
  slides = {pubs/ismir2010-nmfseg-slides.pdf}
}
@inproceedings{bertin10patterns,
  author = {T. Bertin-Mahieux and R. J. Weiss and D. P. W. Ellis},
  title = {{Clustering Beat-Chroma Patterns in a Large Music Database}},
  booktitle = {Proc. International Society for Music Information Retrieval Conference ({ISMIR})},
  pages = {111-116},
  month = aug,
  year = 2010,
  address = {Utrecht, Netherlands},
  pdf = {pubs/ismir2010-beatchromapatterns.pdf},
  web = {http://www.columbia.edu/~tb2332/ProjClustering/ClusteringChromas.html}
}
@inproceedings{cho10chordreco,
  author = {T. Cho and R. J. Weiss and J. P. Bello},
  title = {{Exploring Common Variations in State of the Art Chord Recognition Systems}},
  booktitle = {Proc. Sound and Music Computing Conference ({SMC})},
  pages = {1-8},
  month = jul,
  year = 2010,
  address = {Barcelona, Spain},
  pdf = {pubs/smc2010-chordreco.pdf}
}
@article{weiss11siplca,
  author = {R. J. Weiss and J. P. Bello},
  title = {{Unsupervised Discovery of Temporal Structure in Music}},
  journal = {IEEE Journal of Selected Topics in Signal Processing},
  year = 2011,
  month = oct,
  volume = 5,
  number = 6,
  pages = {1240-1251},
  issn = {1932-4553},
  doi = {10.1109/JSTSP.2011.2145356},
  pdf = {pubs/jstsp2011-siplca.pdf}
}
@article{weiss11messlev,
  title = {Combining Localization Cues and Source Model Constraints
           for Binaural Source Separation},
  author = {R. J. Weiss and M. I. Mandel and D. P. W. Ellis},
  journal = {Speech Communication},
  note = {Special issue on Perceptual and Statistical Audition},
  year = 2011,
  month = may,
  volume = {53},
  number = {5},
  pages = {606-621},
  issn = {0167-6393},
  doi = {10.1016/j.specom.2011.01.003},
  pdf = {pubs/specom2011-messlev.pdf}
}
@inproceedings{bertin11evaluating,
  author = {T. Bertin-Mahieux and G. Grindlay and R. J. Weiss and D. P. W. Ellis},
  title = {{Evaluating Music Sequence Models Through Missing Data}},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = may,
  pages = {177-180},
  year = 2011,
  address = {Prague, Czech Republic},
  issn = {1520-6149},
  doi = {10.1109/ICASSP.2011.5946369},
  pdf = {pubs/icassp2011-imputation.pdf}
}
@article{pedregosa11scikit-learn,
  title = {{scikit-learn}: Machine Learning in {Python}},
  author = {F. Pedregosa and G. Varoquaux and A. Gramfort and V. Michel
                  and B. Thirion and O. Grisel and M. Blondel
                  and P. Prettenhofer and R. Weiss and V. Dubourg
                  and J. Vanderplas and A. Passos and D. Cournapeau
                  and M. Brucher and M. Perrot and \'{E}. Duchesnay},
  journal = {Journal of Machine Learning Research},
  volume = 12,
  month = oct,
  pages = {2825-2830},
  year = {2011},
  http = {http://jmlr.org/papers/v12/pedregosa11a.html},
  pdf = {pubs/jmlr2011-scikit-learn.pdf},
  arxiv = {https://arxiv.org/abs/1201.0490}
}
@inproceedings{weston12lcr,
  title = {Latent Collaborative Retrieval},
  author = {J. Weston and C. Wang and R. Weiss and A. Berenzweig},
  booktitle = {Proc. International Conference on Machine Learning ({ICML})},
  month = jun,
  year = {2012},
  address = {Edinburgh, Scotland},
  http = {http://icml.cc/discuss/2012/12.html},
  pdf = {pubs/icml2012-lcr.pdf},
  arxiv = {https://arxiv.org/abs/1206.4603}
}
@inproceedings{weston13awe,
  title = {Affinity Weighted Embedding},
  author = {J. Weston and R. Weiss and H. Yee},
  booktitle = {Proc. International Conference on Learning Representations ({ICLR})},
  month = may,
  year = {2013},
  address = {Scottsdale, USA},
  http = {http://openreview.net/document/8bc82d3f-df5e-4602-bc3d-2f6fa0196f5f},
  pdf = {pubs/iclr2013-awe.pdf},
  arxiv = {https://arxiv.org/abs/1301.4171}
}
@inproceedings{weston13usermax,
  title = {Nonlinear Latent Factorization by Embedding Multiple User Interests},
  author = {J. Weston and R. J. Weiss and H. Yee},
  booktitle = {Proc. ACM Conference on Recommender Systems ({RecSys})},
  pages = {65-68},
  month = oct,
  year = {2013},
  address = {Hong Kong},
  doi = {10.1145/2507157.2507209},
  pdf = {pubs/recsys2013-usermax.pdf}
}
@inproceedings{weston13kaos,
  title = {Learning to Rank Recommendations with the k-order Statistic Loss},
  author = {J. Weston and H. Yee and R. J. Weiss},
  booktitle = {Proc. ACM Conference on Recommender Systems ({RecSys})},
  pages = {245-248},
  month = oct,
  year = {2013},
  address = {Hong Kong},
  doi = {10.1145/2507157.2507210},
  pdf = {pubs/recsys2013-kaos.pdf}
}
@inproceedings{weston14awe,
  title = {Affinity Weighted Embedding},
  author = {J. Weston and R. Weiss and H. Yee},
  booktitle = {Proc. International Conference on Machine Learning ({ICML})},
  pages = {1215-1223},
  month = jun,
  year = {2014},
  address = {Beijing, China},
  http = {http://jmlr.org/proceedings/papers/v32/weston14.html},
  pdf = {pubs/icml2014-awe.pdf}
}
@inproceedings{hoshen15waveformam,
  author = {Y. Hoshen and R. J. Weiss and K. W. Wilson},
  title = {{Speech Acoustic Modeling from Raw Multichannel Waveforms}},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = apr,
  year = 2015,
  address = {Brisbane, Australia},
  doi = {10.1109/ICASSP.2015.7178847},
  pdf = {pubs/icassp2015-waveformam.pdf}
}
@inproceedings{sainath15waveform_cldnn,
  title = {Learning the Speech Front-End with Raw Waveform {CLDNN}s},
  author = {T. N. Sainath and R. J. Weiss and A. Senior and K. W. Wilson and O. Vinyals},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = 2015,
  address = {Dresden, Germany},
  pdf = {pubs/interspeech2015-waveform_cldnn.pdf}
}
@inproceedings{sainath15multichannel,
  title = {Speaker Location and Microphone Spacing Invariant Acoustic Modeling from Raw Multichannel Waveforms},
  author = {T. N. Sainath and R. J. Weiss and K. W. Wilson and A. Narayanan and M. Bacchiani and A. Senior},
  booktitle = {Proc. {IEEE} Automatic Speech Recognition and Understanding Workshop ({ASRU})},
  month = dec,
  year = 2015,
  address = {Scottsdale, USA},
  doi = {10.1109/ASRU.2015.7404770},
  pdf = {pubs/asru2015-multichannel_cldnn.pdf}
}
@inproceedings{sainath16factored,
  title = {Factored Spatial and Spectral Multichannel Raw Waveform {CLDNN}s},
  author = {T. N. Sainath and R. J. Weiss and K. W. Wilson and A. Narayanan and M. Bacchiani},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = mar,
  year = 2016,
  address = {Shanghai, China},
  pdf = {pubs/icassp2016-factored_cldnn.pdf}
}
@inproceedings{li16adaptive,
  title = {Neural Network Adaptive Beamforming for Robust Multichannel Speech Recognition},
  author = {B. Li and T. N. Sainath and R. J. Weiss and K. W. Wilson and M. Bacchiani},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = 2016,
  address = {San Francisco, USA},
  doi = {10.21437/Interspeech.2016-173},
  pdf = {pubs/interspeech2016-waveform_cldnn_adaptive.pdf}
}
@inproceedings{sainath16speedups,
  title = {Reducing the Computational Complexity of Multimicrophone Acoustic Models with Integrated Feature Extraction},
  author = {T. N. Sainath and A. Narayanan and R. J. Weiss and E. Variani and K. W. Wilson and M. Bacchiani and I. Shafran},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = 2016,
  address = {San Francisco, USA},
  doi = {10.21437/Interspeech.2016-92},
  pdf = {pubs/interspeech2016-waveform_cldnn_speedups.pdf}
}
@inproceedings{hershey17audiocnn,
  title = {{CNN} Architectures for Large-Scale Audio Classification},
  author = {S. Hershey and S. Chaudhuri and D. P. W. Ellis and J. F. Gemmeke and A. Jansen and R. C. Moore and M. Plakal and D. Platt and R. A. Saurous and B. Seybold and M. Slaney and R. J. Weiss and K. Wilson},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = mar,
  year = 2017,
  address = {New Orleans, USA},
  arxiv = {https://arxiv.org/abs/1609.09430},
  doi = {10.1109/ICASSP.2017.7952132},
  pdf = {pubs/icassp2017-audiocnn.pdf}
}
@article{sainath17multichannel,
  title = {Multichannel Signal Processing with Deep Neural Networks for Automatic Speech Recognition},
  author = {T. N. Sainath and R. J. Weiss and K. W. Wilson and B. Li and A. Narayanan and E. Variani and M. Bacchiani and I. Shafran and A. Senior and K. W. Chin and A. Misra and C. Kim},
  journal = {{IEEE/ACM} Transactions on Audio, Speech, and Language Processing},
  month = feb,
  year = {2017},
  volume = {25},
  number = {5},
  pages = {965-979},
  publisher = {IEEE},
  doi = {10.1109/TASLP.2017.2672401},
  pdf = {pubs/taslp2017-multichannel.pdf}
}
@incollection{sainath17raw,
  title = {Raw Multichannel Processing Using Deep Neural Networks},
  author = {T. N. Sainath and R. J. Weiss and K. W. Wilson and B. Li and A. Narayanan and E. Variani and M. Bacchiani and I. Shafran and A. Senior and K. W. Chin and A. Misra and C. Kim},
  editor = {Shinji Watanabe and Marc Delcroix and Florian Metze and John R. Hershey},
  booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
  publisher = {Springer},
  year = {2017},
  doi = {10.1007/978-3-319-64680-0},
  pdf = {pubs/jsalt2017-raw.pdf}
}
@inproceedings{raffel2017online,
  title = {Online and Linear-Time Attention by Enforcing Monotonic Alignments},
  author = {C. Raffel and T. Luong and P. J. Liu and R. J. Weiss and D. Eck},
  booktitle = {Proc. International Conference on Machine Learning ({ICML})},
  month = aug,
  year = {2017},
  address = {Sydney, Australia},
  arxiv = {https://arxiv.org/abs/1704.00784},
  http = {http://proceedings.mlr.press/v70/raffel17a.html}
}
@inproceedings{wang2017tacotron,
  title = {{Tacotron}: Towards End-to-End Speech Synthesis},
  author = {Y. Wang and R. J. Skerry-Ryan and D. Stanton and Y. Wu and
                  R. J. Weiss and N. Jaitly and Z. Yang and Y. Xiao and
                  Z. Chen and S. Bengio and Q. Le and
                  Y. Agiomyrgiannakis and R. Clark and R. A. Saurous},
  booktitle = {Proc. Interspeech},
  month = aug,
  year = {2017},
  address = {Stockholm, Sweden},
  doi = {10.21437/Interspeech.2017-1452},
  arxiv = {https://arxiv.org/abs/1703.10135}
}
@inproceedings{weiss2017sequence,
  title = {Sequence-to-Sequence Models Can Directly Translate Foreign Speech},
  author = {R. J. Weiss and J. Chorowski and N. Jaitly and Y. Wu and Z. Chen},
  booktitle = {Proc. Interspeech},
  month = aug,
  year = {2017},
  address = {Stockholm, Sweden},
  doi = {10.21437/Interspeech.2017-503},
  arxiv = {https://arxiv.org/abs/1703.08581},
  slides = {pubs/interspeech2017-speech_translation-slides.pdf}
}
@inproceedings{li2017acoustic,
  title = {Acoustic Modeling for Google Home},
  author = {B. Li and T. N. Sainath and A. Narayanan and J. Caroselli and
                    M. Bacchiani and A. Misra and I. Shafran and H. Sak and
                    G. Pundak and K. Chin and K. C. Sim and R. J. Weiss and
                    K. Wilson and E. Variani and C. Kim and O. Siohan and
                    M. Weintraub and E. McDermott and R. Rose and M. Shannon},
  booktitle = {Proc. Interspeech},
  month = aug,
  year = {2017},
  address = {Stockholm, Sweden},
  doi = {10.21437/Interspeech.2017-234},
  pdf = {pubs/interspeech2017-googlehome.pdf}
}
@incollection{bello18content,
  title = {Content-Based Methods for Knowledge Discovery in Music},
  author = {J. P. Bello and P. Grosche and M. M\"{u}ller and R. Weiss},
  editor = {R. Bader},
  booktitle = {Springer Handbook of Systematic Musicology},
  pages = {823-840},
  publisher = {Springer},
  year = {2018},
  month = mar,
  doi = {10.1007/978-3-662-55004-5_39}
}
@inproceedings{shen2018tacotron2,
  title = {Natural {TTS} Synthesis by Conditioning {WaveNet} on Mel Spectrogram Predictions},
  author = {J. Shen and R. Pang and R. J. Weiss and M. Schuster and N. Jaitly and Z. Yang and Z. Chen and Y. Zhang and Y. Wang and R. J. Skerry-Ryan and R. A. Saurous and Y. Agiomyrgiannakis and Y. Wu},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = apr,
  year = {2018},
  address = {Calgary, Canada},
  arxiv = {https://arxiv.org/abs/1712.05884},
  web = {https://research.googleblog.com/2017/12/tacotron-2-generating-human-like-speech.html}
}
@inproceedings{chorowski2018styletransfer,
  title = {On Using Backpropagation for Speech Texture Generation and Voice Conversion},
  author = {J. Chorowski and R. J. Weiss and R. A. Saurous and S. Bengio},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = apr,
  year = {2018},
  address = {Calgary, Canada},
  arxiv = {https://arxiv.org/abs/1712.08363},
  web = {https://google.github.io/speech_style_transfer/samples.html}
}
@inproceedings{toshniwal2018multilingual,
  title = {Multilingual Speech Recognition with a Single End-to-End Model},
  author = {S. Toshniwal and T. N. Sainath and R. J. Weiss and B. Li and P. Moreno and E. Weinstein and K. Rao},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = apr,
  year = {2018},
  address = {Calgary, Canada},
  arxiv = {https://arxiv.org/abs/1711.01694},
  web = {https://research.googleblog.com/2017/12/improving-end-to-end-models-for-speech.html}
}
@inproceedings{chiu2018sota,
  title = {State-of-the-art Speech Recognition With Sequence-to-Sequence Models},
  author = {C.-C. Chiu and T. N. Sainath and Y. Wu and R. Prabhavalkar and P. Nguyen and Z. Chen and A. Kannan and R. J. Weiss and K. Rao and K. Gonina and N. Jaitly and B. Li and J. Chorowski and M. Bacchiani},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = apr,
  year = {2018},
  address = {Calgary, Canada},
  arxiv = {https://arxiv.org/abs/1712.01769},
  web = {https://research.googleblog.com/2017/12/improving-end-to-end-models-for-speech.html}
}
@inproceedings{skerryryan2018prosody,
  title = {Towards End-to-End Prosody Transfer for Expressive Speech Synthesis with Tacotron},
  author = {R. J. Skerry-Ryan and E. Battenberg and Y. Xiao and Y. Wang and D. Stanton and J. Shor and R. J. Weiss and R. Clark and R. A. Saurous},
  booktitle = {Proc. International Conference on Machine Learning ({ICML})},
  year = {2018},
  month = jul,
  address = {Stockholm, Sweden},
  arxiv = {https://arxiv.org/abs/1803.09047},
  web = {https://research.googleblog.com/2018/03/expressive-speech-synthesis-with.html}
}
@article{antognini18textures,
  title = {Synthesizing Diverse, High-Quality Audio Textures},
  author = {J. Antognini and M. Hoffman and R. J. Weiss},
  journal = {arXiv preprint arXiv:1806.08002},
  month = jun,
  year = {2018},
  arxiv = {https://arxiv.org/abs/1806.08002},
  web = {https://antognini-google.github.io/audio_textures/}
}
@inproceedings{jia2018multispeaker,
  title = {Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis},
  author = {Y. Jia and Y. Zhang and R. J. Weiss and Q. Wang and J. Shen and F. Ren and Z. Chen and P. Nguyen and R. Pang and I. Lopez-Moreno and Y. Wu},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  year = {2018},
  month = dec,
  address = {Montr\'{e}al, Canada},
  arxiv = {https://arxiv.org/abs/1806.04558},
  reviews = {http://papers.nips.cc/paper/7700-transfer-learning-from-speaker-verification-to-multispeaker-text-to-speech-synthesis},
  web = {https://google.github.io/tacotron/publications/speaker_adaptation},
  poster = {https://google.github.io/tacotron/publications/speaker_adaptation/poster.pdf}
}
@inproceedings{hsu2018disentangling,
  author = {W. N. Hsu and Y. Zhang and R. J. Weiss and Y. A. Chung and Y. Wang and Y. Wu and J. Glass},
  title = {Disentangling Correlated Speaker and Noise for Speech Synthesis via Data Augmentation and Adversarial Factorization},
  booktitle = {NeurIPS 2018 Workshop on Interpretability and Robustness in Audio, Speech, and Language},
  month = dec,
  year = {2018},
  address = {Montr\'{e}al, Canada},
  note = {also at \href{https://dx.doi.org/10.1109/ICASSP.2019.8683561}{ICASSP 2019}},
  reviews = {https://openreview.net/forum?id=Bkg9ZeBB37},
  web = {https://google.github.io/tacotron/publications/adv_tts}
}
@article{chorowski2019unsupervised,
  title = {Unsupervised speech representation learning using {WaveNet} autoencoders},
  author = {J. Chorowski and R. J. Weiss and S. Bengio and A. van den Oord},
  journal = {{IEEE/ACM} Transactions on Audio, Speech, and Language Processing},
  month = dec,
  year = {2019},
  volume = {27},
  number = {12},
  pages = {2041-2053},
  publisher = {IEEE},
  arxiv = {https://arxiv.org/abs/1901.08810},
  doi = {10.1109/TASLP.2019.2938863}
}
@inproceedings{hsu2019hierarchical,
  title = {Hierarchical Generative Modeling for Controllable Speech Synthesis},
  author = {W. N. Hsu and Y. Zhang and R. J. Weiss and H. Zen and Y. Wu and Y. Wang and Y. Cao and Y. Jia and Z. Chen and J. Shen and P. Nguyen and R. Pang},
  booktitle = {Proc. International Conference on Learning Representations ({ICLR})},
  month = may,
  year = {2019},
  address = {New Orleans, USA},
  arxiv = {https://arxiv.org/abs/1810.07217},
  web = {https://google.github.io/tacotron/publications/gmvae_controllable_tts},
  reviews = {https://openreview.net/forum?id=rygkk305YQ}
}
@inproceedings{jia2019leveraging,
  author = {Y. Jia and M. Johnson and W. Macherey and R. J. Weiss and Y. Cao and C.-C. Chiu and N. Ari and S. Laurenzo and Y. Wu},
  title = {Leveraging Weakly Supervised Data to Improve End-to-End Speech-to-Text Translation},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = may,
  year = {2019},
  address = {Brighton, UK},
  doi = {10.1109/ICASSP.2019.8683343},
  arxiv = {https://arxiv.org/abs/1811.02050},
  slides = {pubs/icassp2019-speech_translation-slides.pdf}
}
@inproceedings{guo2019spelling,
  title = {A Spelling Correction Model for End-to-End Speech Recognition},
  author = {J. Guo and T. N. Sainath and R. J. Weiss},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = may,
  year = {2019},
  address = {Brighton, UK},
  doi = {10.1109/ICASSP.2019.8683745},
  arxiv = {https://arxiv.org/abs/1902.07178},
  slides = {pubs/icassp2019-spelling-slides.pdf}
}
@inproceedings{antognini2019audio,
  title = {Audio Texture Synthesis with Random Neural Networks: Improving Diversity and Quality},
  author = {J. M. Antognini and M. Hoffman and R. J. Weiss},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = may,
  year = {2019},
  address = {Brighton, UK},
  doi = {10.1109/ICASSP.2019.8682598},
  web = {https://antognini-google.github.io/audio_textures},
  pdf = {pubs/icassp2019-texture.pdf},
  poster = {pubs/icassp2019-texture-poster.pdf}
}
@inproceedings{wang2019voicefilter,
  author = {Q. Wang and H. Muckenhirn and K. Wilson and P. Sridhar and Z. Wu and J. Hershey and R. A. Saurous and R. J. Weiss and Y. Jia and I. Lopez-Moreno},
  title = {{VoiceFilter}: Targeted Voice Separation by Speaker-Conditioned Spectrogram Masking},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = {2019},
  address = {Graz, Austria},
  doi = {10.21437/Interspeech.2019-1101},
  arxiv = {https://arxiv.org/abs/1810.04826},
  web = {https://google.github.io/speaker-id/publications/VoiceFilter}
}
@inproceedings{jia2019direct,
  title = {Direct Speech-to-Speech Translation with a Sequence-to-Sequence Model},
  author = {Y. Jia and R. J. Weiss and F. Biadsy and W. Macherey and M. Johnson and Z. Chen and Y. Wu},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = {2019},
  address = {Graz, Austria},
  doi = {10.21437/Interspeech.2019-1951},
  arxiv = {https://arxiv.org/abs/1904.06037},
  web = {https://google-research.github.io/lingvo-lab/translatotron}
}
@inproceedings{biadsy2019parrotron,
  title = {Parrotron: An End-to-End Speech-to-Speech Conversion Model and its Applications to Hearing-Impaired Speech and Speech Separation},
  author = {F. Biadsy and R. J. Weiss and P. J. Moreno and D. Kanevsky and Y. Jia},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = {2019},
  address = {Graz, Austria},
  doi = {10.21437/Interspeech.2019-1789},
  arxiv = {https://arxiv.org/abs/1904.04169},
  web = {https://google.github.io/tacotron/publications/parrotron}
}
@inproceedings{zen2019libritts,
  title = {{LibriTTS}: A Corpus Derived from {LibriSpeech} for Text-to-Speech},
  author = {H. Zen and V. Dang and R. Clark and Y. Zhang and R. J. Weiss and Y. Jia and Z. Chen and Y. Wu},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = {2019},
  address = {Graz, Austria},
  doi = {10.21437/Interspeech.2019-2441},
  arxiv = {https://arxiv.org/abs/1904.02882},
  web = {http://www.openslr.org/60/}
}
@inproceedings{zhang2019learning,
  title = {Learning to Speak Fluently in a Foreign Language: Multilingual Speech Synthesis and Cross-Language Voice Cloning},
  author = {Y. Zhang and R. J. Weiss and H. Zen and Y. Wu and Z. Chen and R. J. Skerry-Ryan and Y. Jia and A. Rosenberg and B. Ramabhadran},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = {2019},
  address = {Graz, Austria},
  doi = {10.21437/Interspeech.2019-2668},
  arxiv = {https://arxiv.org/abs/1907.04448},
  web = {https://google.github.io/tacotron/publications/multilingual}
}
@inproceedings{sainath2020joint,
  title = {An Attention-Based Joint Acoustic and Text on-Device End-To-End Model},
  author = {T. N. Sainath and R. Pang and R. J. Weiss and Y. He and C.-C. Chiu and T. Strohman},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = may,
  year = {2020},
  pages = {7039-7043},
  doi = {10.1109/ICASSP40776.2020.9053510},
  pdf = {pubs/icassp2020-jatd.pdf}
}
@inproceedings{sun2020hierarchical_prosody,
  title = {Fully-hierarchical fine-grained prosody modeling for interpretable speech synthesis},
  author = {G. Sun and Y. Zhang and R. J. Weiss and Y. Cao and H. Zen and Y. Wu},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = may,
  year = {2020},
  pages = {6264-6268},
  doi = {10.1109/ICASSP40776.2020.9053520},
  arxiv = {https://arxiv.org/abs/2002.03785},
  web = {https://google.github.io/tacotron/publications/hierarchical_prosody}
}
@inproceedings{sun2020prosody_prior,
  title = {Generating diverse and natural text-to-speech samples using a quantized fine-grained VAE and auto-regressive prosody prior},
  author = {G. Sun and Y. Zhang and R. J. Weiss and Y. Cao and H. Zen and A. Rosenberg and B. Ramabhadran and Y. Wu},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = may,
  year = {2020},
  pages = {6699-6703},
  doi = {10.1109/ICASSP40776.2020.9053436},
  arxiv = {https://arxiv.org/abs/2002.03788},
  web = {https://google.github.io/tacotron/publications/prosody_prior}
}
@inproceedings{wisdom2020speech_mixit,
  title = {Unsupervised Speech Separation Using Mixtures of Mixtures},
  author = {S. Wisdom and E. Tzinis and H. Erdogan and R. J. Weiss and K. Wilson and J. R. Hershey},
  booktitle = {ICML 2020 Workshop on Self-supervision in Audio and Speech},
  month = jul,
  year = {2020},
  reviews = {https://openreview.net/forum?id=qMMzJGRPT2d},
  web = {https://universal-sound-separation.github.io/unsupervised_speech_separation/}
}
@inproceedings{wisdom2020mixit,
  title = {Unsupervised Sound Separation Using Mixture Invariant Training},
  author = {S. Wisdom and E. Tzinis and H. Erdogan and R. J. Weiss and K. Wilson and J. R. Hershey},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  month = dec,
  year = {2020},
  arxiv = {https://arxiv.org/abs/2006.12701},
  reviews = {https://papers.nips.cc/paper/2020/hash/28538c394c36e4d5ea8ff5ad60562a93-Abstract.html},
  web = {https://universal-sound-separation.github.io/unsupervised_sound_separation/}
}
@inproceedings{chen2021wavegrad,
  title = {{WaveGrad}: Estimating Gradients for Waveform Generation},
  author = {N. Chen and Y. Zhang and H. Zen and R. J. Weiss and M. Norouzi and W. Chan},
  booktitle = {Proc. International Conference on Learning Representations ({ICLR})},
  month = may,
  year = {2021},
  arxiv = {https://arxiv.org/abs/2009.00713},
  reviews = {https://openreview.net/forum?id=NsMLjcFaO8O},
  web = {https://wavegrad.github.io/}
}
@inproceedings{elias2021parallel,
  title = {Parallel {Tacotron}: Non-Autoregressive and Controllable {TTS}},
  author = {I. Elias and H. Zen and J. Shen and Y. Zhang and Y. Jia and R. J. Weiss and Y. Wu},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = jun,
  year = {2021},
  doi = {10.1109/ICASSP39728.2021.9414718},
  arxiv = {https://arxiv.org/abs/2010.11439},
  web = {https://google.github.io/tacotron/publications/parallel_tacotron/}
}
@inproceedings{weiss2021wavetacotron,
  title = {{Wave-Tacotron}: Spectrogram-free end-to-end text-to-speech synthesis},
  author = {R. J. Weiss and R. J. Skerry-Ryan and E. Battenberg and S. Mariooryad and D. P. Kingma},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  month = jun,
  year = {2021},
  doi = {10.1109/ICASSP39728.2021.9413851},
  arxiv = {https://arxiv.org/abs/2011.03568},
  web = {https://google.github.io/tacotron/publications/wave-tacotron/},
  slides = {pubs/icassp2021-wavetaco-slides.pdf},
  poster = {pubs/icassp2021-wavetaco-poster.pdf},
  video = {https://www.youtube.com/watch?v=YqMywq_Eg_o}
}
@inproceedings{wang2021multitask,
  title = {Multitask Training with Text Data for End-to-End Speech Recognition},
  author = {P. Wang and T. N. Sainath and R. J. Weiss},
  booktitle = {Proc. Interspeech},
  month = aug,
  year = {2021},
  arxiv = {https://arxiv.org/abs/2010.14318}
}
@inproceedings{chen2021wavegrad2,
  title = {{WaveGrad} 2: Iterative Refinement for Text-to-Speech Synthesis},
  author = {N. Chen and Y. Zhang and H. Zen and R. J. Weiss and M. Norouzi and N. Dehak and W. Chan},
  booktitle = {Proc. Interspeech},
  month = aug,
  year = {2021},
  arxiv = {https://arxiv.org/abs/2106.09660},
  web = {https://wavegrad.github.io/v2}
}
@inproceedings{wisdom2021sparse,
  title = {Sparse, Efficient, and Semantic Mixture Invariant Training: Taming In-the-Wild Unsupervised Sound Separation},
  author = {S. Wisdom and A. Jansen and R. J. Weiss and H. Erdogan and J. R. Hershey},
  booktitle = {Proc. {IEEE} Workshop on Applications of Signal Processing to Audio and Acoustics ({WASPAA})},
  month = oct,
  year = {2021},
  arxiv = {https://arxiv.org/abs/2106.00847}
}