@phdthesis{Kiesel2022, author = {Kiesel, Johannes}, title = {Harnessing Web Archives to Tackle Selected Societal Challenges}, doi = {10.25643/bauhaus-universitaet.4660}, url = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20220622-46602}, school = {Bauhaus-Universit{\"a}t Weimar}, year = {2022}, abstract = {With the growing importance of the World Wide Web, the major challenges our society faces are also increasingly affecting the digital areas of our lives. Some of the associated problems can be addressed by computer science, and some of these specifically by data-driven research. To do so, however, requires to solve open issues related to archive quality and the large volume and variety of the data contained. This dissertation contributes data, algorithms, and concepts towards leveraging the big data and temporal provenance capabilities of web archives to tackle societal challenges. We selected three such challenges that highlight the central issues of archive quality, data volume, and data variety, respectively: (1) For the preservation of digital culture, this thesis investigates and improves the automatic quality assurance of the web page archiving process, as well as the further processing of the resulting archive data for automatic analysis. (2) For the critical assessment of information, this thesis examines large datasets of Wikipedia and news articles and presents new methods for automatically determining quality and bias. (3) For digital security and privacy, this thesis exploits the variety of content on the web to quantify the security of mnemonic passwords and analyzes the privacy-aware re-finding of the various seen content through private web archives.}, subject = {Informatik}, language = {en} } @phdthesis{Gollub, author = {Gollub, Tim}, title = {Information Retrieval for the Digital Humanities}, doi = {10.25643/bauhaus-universitaet.4673}, url = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20220801-46738}, school = {Bauhaus-Universit{\"a}t Weimar}, pages = {177}, abstract = {In ten chapters, this thesis presents information retrieval technology which is tailored to the research activities that arise in the context of corpus-based digital humanities projects. The presentation is structured by a conceptual research process that is introduced in Chapter 1. The process distinguishes a set of five research activities: research question generation, corpus acquisition, research question modeling, corpus annotation, and result dissemination. Each of these research activities elicits different information retrieval tasks with special challenges, for which algorithmic approaches are presented after an introduction of the core information retrieval concepts in Chapter 2. A vital concept in many of the presented approaches is the keyquery paradigm introduced in Chapter 3, which represents an operation that returns relevant search queries in response to a given set of input documents. Keyqueries are proposed in Chapter 4 for the recommendation of related work, and in Chapter 5 for improving access to aspects hidden in the long tail of search result lists. With pseudo-descriptions, a document expansion approach is presented in Chapter 6. The approach improves the retrieval performance for corpora where only bibliographic meta-data is originally available. In Chapter 7, the keyquery paradigm is employed to generate dynamic taxonomies for corpora in an unsupervised fashion. Chapter 8 turns to the exploration of annotated corpora, and presents scoped facets as a conceptual extension to faceted search systems, which is particularly useful in exploratory search settings. For the purpose of highlighting the major topical differences in a sequence of sub-corpora, an algorithm called topical sequence profiling is presented in Chapter 9. The thesis concludes with two pilot studies regarding the visualization of (re)search results for the means of successful result dissemination: a metaphoric interpretation of the information nutrition label, as well as the philosophical bodies, which are 3D-printed search results.}, subject = {Information Retrieval}, language = {en} }