% Bibliography database.
% Conventions used below: one field per line; names in "Last, First" form;
% non-ASCII characters written as LaTeX escapes; DOIs stored bare (no resolver
% prefix); arXiv preprints identified through their arXiv DOI.

@article{schoch2020abgeleitete,
  title   = {Abgeleitete {Textformate}: {Prinzip} und {Beispiele}},
  author  = {Sch{\"o}ch, Christof and D{\"o}hl, Fr{\'e}d{\'e}ric and Rettinger, Achim and Gius, Evelyn and Trilcke, Peer and Leinen, Peter and Jannidis, Fotis and Hinzmann, Maria and R{\"o}pke, J{\"o}rg},
  year    = 2020,
  journal = {RuZ-Recht und Zugang},
  volume  = 1,
  number  = 2,
  doi     = {10.5771/2699-1284-2020-2-160}
}

@article{schoch2020abgeleitete2,
  title   = {Abgeleitete {Textformate}: {Text} und {Data} {Mining} mit urheberrechtlich gesch{\"u}tzten {Textbest{\"a}nden}},
  author  = {Sch{\"o}ch, Christof and D{\"o}hl, Fr{\'e}d{\'e}ric and Rettinger, Achim and Gius, Evelyn and Trilcke, Peer and Leinen, Peter and Jannidis, Fotis and Hinzmann, Maria and R{\"o}pke, J{\"o}rg},
  year    = 2020,
  journal = {Zeitschrift f{\"u}r digitale Geisteswissenschaften},
  doi     = {10.17175/2020_006}
}

@inproceedings{wang2018glue,
  title     = {{GLUE}: {A} Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},
  author    = {Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.},
  year      = 2019,
  booktitle = {7th International Conference on Learning Representations, {ICLR} 2019, New Orleans, LA, USA, May 6-9, 2019},
  doi       = {10.18653/v1/W18-5446}
}

@inproceedings{wang2019superglue,
  title     = {{SuperGLUE}: {A} Stickier Benchmark for General-Purpose Language Understanding Systems},
  author    = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.},
  year      = 2019,
  booktitle = {Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada},
  doi       = {10.5555/3454287.3454581},
  editor    = {Wallach, Hanna M. and Larochelle, Hugo and Beygelzimer, Alina and d'Alch{\'{e}}{-}Buc, Florence and Fox, Emily B. and Garnett, Roman}
}

@misc{he2020deberta,
  title         = {{DeBERTa}: Decoding-enhanced {BERT} with Disentangled Attention},
  author        = {He, Pengcheng and Liu, Xiaodong and Gao, Jianfeng and Chen, Weizhu},
  year          = 2020,
  eprint        = {2006.03654},
  archiveprefix = {arXiv},
  doi           = {10.48550/arXiv.2006.03654},
  copyright     = {arXiv.org perpetual, non-exclusive license}
}

@inproceedings{pan2020privacy,
  title        = {Privacy risks of general-purpose language models},
  author       = {Pan, Xudong and Zhang, Mi and Ji, Shouling and Yang, Min},
  year         = 2020,
  booktitle    = {2020 IEEE Symposium on Security and Privacy (SP)},
  doi          = {10.1109/SP40000.2020.00095},
  organization = {IEEE}
}

@inproceedings{carlini2020extracting,
  title     = {Extracting Training Data from Large Language Models},
  author    = {Carlini, Nicholas and Tram{\`e}r, Florian and Wallace, Eric and Jagielski, Matthew and Herbert-Voss, Ariel and Lee, Katherine and Roberts, Adam and Brown, Tom B. and Song, Dawn Xiaodong and Erlingsson, {\'U}lfar and Oprea, Alina and Raffel, Colin},
  year      = 2021,
  booktitle = {USENIX Security Symposium},
  url       = {https://www.usenix.org/system/files/sec21-carlini-extracting.pdf}
}

@inproceedings{bender2021dangers,
  title     = {On the Dangers of Stochastic Parrots: Can Language Models Be Too Big?},
  author    = {Bender, Emily M. and Gebru, Timnit and McMillan-Major, Angelina and Shmitchell, Shmargaret},
  year      = 2021,
  booktitle = {Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency},
  doi       = {10.1145/3442188.3445922}
}

@inproceedings{devlin2018bert,
  title     = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  author    = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  year      = 2019,
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  doi       = {10.18653/v1/N19-1423}
}

@inproceedings{vaswani2017attention,
  title     = {Attention is All you Need},
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  year      = 2017,
  booktitle = {Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, {USA}},
  editor    = {Guyon, Isabelle and von Luxburg, Ulrike and Bengio, Samy and Wallach, Hanna M. and Fergus, Rob and Vishwanathan, S. V. N. and Garnett, Roman}
}

@article{liu2019roberta,
  title         = {{RoBERTa}: A Robustly Optimized {BERT} Pretraining Approach},
  author        = {Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin},
  year          = 2019,
  journal       = {arXiv},
  eprint        = {1907.11692},
  archiveprefix = {arXiv},
  doi           = {10.48550/arXiv.1907.11692}
}

@inproceedings{brown2020language,
  title     = {Language Models are Few-Shot Learners},
  author    = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
  year      = 2020,
  booktitle = {Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual},
  publisher = {Curran Associates, Inc.},
  volume    = 33,
  pages     = {1877--1901},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
  editor    = {Larochelle, Hugo and Ranzato, Marc'Aurelio and Hadsell, Raia and Balcan, Maria{-}Florina and Lin, Hsuan{-}Tien}
}

@inproceedings{song2020information,
  title     = {Information leakage in embedding models},
  author    = {Song, Congzheng and Raghunathan, Ananth},
  year      = 2020,
  booktitle = {Proceedings of the 2020 ACM SIGSAC Conference on Computer and Communications Security},
  doi       = {10.1145/3372297.3417270}
}

@inproceedings{Thomas2020InvestigatingTI,
  title     = {Investigating the Impact of Pre-Trained Word Embeddings on Memorization in Neural Networks},
  author    = {Thomas, Aleena and Adelani, David Ifeoluwa and Davody, Ali and Mogadala, Aditya and Klakow, Dietrich},
  year      = 2020,
  booktitle = {Text, Speech, and Dialogue: 23rd International Conference, TSD 2020, Brno, Czech Republic, September 8--11, 2020},
  venue     = {Brno, Czech Republic},
  publisher = {Springer},
  address   = {Berlin, Heidelberg},
  pages     = {273--281},
  doi       = {10.1007/978-3-030-58323-1_30},
  numpages  = 9
}

@inproceedings{song2019auditing,
  title     = {Auditing Data Provenance in Text-Generation Models},
  author    = {Song, Congzheng and Shmatikov, Vitaly},
  year      = 2019,
  booktitle = {Proceedings of the 25th {ACM} {SIGKDD} International Conference on Knowledge Discovery {\&} Data Mining, {KDD} 2019, Anchorage, AK, USA, August 4-8, 2019},
  doi       = {10.1145/3292500.3330885},
  editor    = {Teredesai, Ankur and Kumar, Vipin and Li, Ying and Rosales, R{\'{o}}mer and Terzi, Evimaria and Karypis, George}
}

@inproceedings{press2016using,
  title     = {Using the Output Embedding to Improve Language Models},
  author    = {Press, Ofir and Wolf, Lior},
  year      = 2017,
  booktitle = {Proceedings of the 15th Conference of the {E}uropean Chapter of the Association for Computational Linguistics: Volume 2, Short Papers}
}

@inproceedings{inan2016tying,
  title     = {Tying Word Vectors and Word Classifiers: {A} Loss Framework for Language Modeling},
  author    = {Inan, Hakan and Khosravi, Khashayar and Socher, Richard},
  year      = 2017,
  booktitle = {5th International Conference on Learning Representations, {ICLR} 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings}
}

@article{raffel2019exploring,
  title   = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
  author  = {Raffel, Colin and Shazeer, Noam M. and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
  year    = 2020,
  journal = {arXiv},
  doi     = {10.48550/arXiv.1910.10683}
}

@inproceedings{pennington2014glove,
  title     = {{GloVe}: Global Vectors for Word Representation},
  author    = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher},
  year      = 2014,
  booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
  doi       = {10.3115/v1/D14-1162}
}

@inproceedings{mikolov2013efficient,
  title     = {Efficient Estimation of Word Representations in Vector Space},
  author    = {Mikolov, Tom{\'a}{\v{s}} and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
  year      = 2013,
  booktitle = {1st International Conference on Learning Representations, {ICLR} 2013, Scottsdale, Arizona, USA, May 2-4, 2013, Workshop Track Proceedings},
  doi       = {10.48550/arXiv.1301.3781},
  editor    = {Bengio, Yoshua and LeCun, Yann}
}

@inproceedings{shokri2017membership,
  title        = {Membership inference attacks against machine learning models},
  author       = {Shokri, Reza and Stronati, Marco and Song, Congzheng and Shmatikov, Vitaly},
  year         = 2017,
  booktitle    = {2017 IEEE Symposium on Security and Privacy (SP)},
  doi          = {10.1109/SP.2017.41},
  organization = {IEEE}
}

@inproceedings{melis2019exploiting,
  title        = {Exploiting unintended feature leakage in collaborative learning},
  author       = {Melis, Luca and Song, Congzheng and De Cristofaro, Emiliano and Shmatikov, Vitaly},
  year         = 2019,
  booktitle    = {2019 IEEE Symposium on Security and Privacy (SP)},
  doi          = {10.1109/SP.2019.00029},
  organization = {IEEE}
}

@inproceedings{krishna2019thieves,
  title     = {Thieves on {Sesame Street}! Model Extraction of {BERT}-based {APIs}},
  author    = {Krishna, Kalpesh and Tomar, Gaurav Singh and Parikh, Ankur P. and Papernot, Nicolas and Iyyer, Mohit},
  year      = 2020,
  booktitle = {8th International Conference on Learning Representations, {ICLR} 2020, Addis Ababa, Ethiopia, April 26-30, 2020},
  doi       = {10.48550/arXiv.1910.12366}
}

@book{rowling1998stone,
  title     = {Harry Potter and the Sorcerer's Stone},
  author    = {Rowling, J. K.},
  year      = 1998,
  publisher = {Scholastic}
}

@book{rowling2000goblet,
  title     = {Harry Potter and the Goblet of Fire},
  author    = {Rowling, J. K.},
  year      = 2000,
  publisher = {Bloomsbury}
}

@book{rowling2006princce,
  title     = {Harry Potter and the Half-Blood Prince},
  author    = {Rowling, J. K.},
  year      = 2006,
  publisher = {Bloomsbury}
}

@inproceedings{Wolf2020,
  title     = {Transformers: State-of-the-Art Natural Language Processing},
  author    = {Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, Remi and Funtowicz, Morgan and Davison, Joe and Shleifer, Sam and von Platen, Patrick and Ma, Clara and Jernite, Yacine and Plu, Julien and Xu, Canwen and Le Scao, Teven and Gugger, Sylvain and Drame, Mariama and Lhoest, Quentin and Rush, Alexander},
  year      = 2020,
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations}
}

@inproceedings{Papineni2002,
  title     = {{Bleu}: a Method for Automatic Evaluation of Machine Translation},
  author    = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
  year      = 2002,
  booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics},
  doi       = {10.3115/1073083.1073135}
}

@inproceedings{Loper2002,
  title     = {{NLTK}: The Natural Language Toolkit},
  author    = {Loper, Edward and Bird, Steven},
  year      = 2002,
  booktitle = {Proceedings of the {ACL}-02 Workshop on Effective Tools and Methodologies for Teaching Natural Language Processing and Computational Linguistics}
}

@article{rigaki2020,
  title   = {A Survey of Privacy Attacks in Machine Learning},
  author  = {Rigaki, Maria and Garcia, Sebastian},
  year    = 2020,
  journal = {arXiv},
  doi     = {10.48550/arXiv.2007.07646}
}

@article{mahloujifar2021,
  title   = {Membership Inference on Word Embedding and Beyond},
  author  = {Mahloujifar, Saeed and Inan, Huseyin A. and Chase, Melissa and Ghosh, Esha and Hasegawa, Marcello},
  year    = 2021,
  journal = {arXiv},
  doi     = {10.48550/arXiv.2106.11384}
}

@article{hisamoto2020,
  title     = {Membership Inference Attacks on Sequence-to-Sequence Models: {Is} My Data In Your Machine Translation System?},
  author    = {Hisamoto, Sorami and Post, Matt and Duh, Kevin},
  year      = 2020,
  journal   = {Transactions of the Association for Computational Linguistics},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  volume    = 8,
  pages     = {49--63},
  doi       = {10.1162/tacl_a_00299},
  url       = {https://aclanthology.org/2020.tacl-1.4}
}

@inproceedings{vulic2020,
  title     = {Probing Pretrained Language Models for Lexical Semantics},
  author    = {Vuli{\'c}, Ivan and Ponti, Edoardo Maria and Litschko, Robert and Glava{\v{s}}, Goran and Korhonen, Anna},
  year      = 2020,
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing, {EMNLP} 2020, Online, November 16-20, 2020},
  publisher = {Association for Computational Linguistics},
  pages     = {7222--7240},
  doi       = {10.18653/v1/2020.emnlp-main.586},
  editor    = {Webber, Bonnie and Cohn, Trevor and He, Yulan and Liu, Yang},
  groups    = {lexical semantics}
}

@article{wilkinson2016fair,
  title     = {The {FAIR} Guiding Principles for scientific data management and stewardship},
  author    = {Wilkinson, Mark D. and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E. and others},
  year      = 2016,
  journal   = {Scientific Data},
  publisher = {Nature Publishing Group},
  volume    = 3,
  doi       = {10.1038/sdata.2016.18}
}