% Bibliography of theses and journal articles (repository export, cleaned up).
%
% Conventions used in this file:
%   - one field per line, values delimited by braces;
%   - volume and number hold plain integers, year holds the four-digit year;
%   - page ranges use a double hyphen;
%   - non-ASCII characters are written as LaTeX escapes such as {\"o};
%   - subject and language are non-standard fields kept from the export
%     (standard styles ignore them).
%
% NOTE(review): most thesis entries below carry no year field, which the
% thesis entry types require. The year is not derivable from the records
% themselves (the date inside the URN is presumably a deposit date -- confirm),
% so it is flagged per entry instead of being guessed.

% Journal article. The export had the year in the volume field and the text
% "Volume 21, Issue 5" in the number field; both are split into year, volume
% and number here (the DOI suffix 21-1431-2021 agrees with volume 21 / 2021).
@article{WiegmannKerstenSenaratneetal.,
  author    = {Wiegmann, Matti and Kersten, Jens and Senaratne, Hansi and Potthast, Martin and Klan, Friederike and Stein, Benno},
  title     = {Opportunities and risks of disaster data from social media: a systematic review of incident information},
  journal   = {Natural Hazards and Earth System Sciences},
  year      = {2021},
  volume    = {21},
  number    = {5},
  pages     = {1431--1444},
  publisher = {European Geophysical Society},
  address   = {Katlenburg-Lindau},
  doi       = {10.5194/nhess-21-1431-2021},
  url       = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20210804-44634},
  abstract  = {Compiling and disseminating information about incidents and disasters are key to disaster management and relief. But due to inherent limitations of the acquisition process, the required information is often incomplete or missing altogether. To fill these gaps, citizen observations spread through social media are widely considered to be a promising source of relevant information, and many studies propose new methods to tap this resource. Yet, the overarching question of whether and under which circumstances social media can supply relevant information (both qualitatively and quantitatively) still remains unanswered. To shed some light on this question, we review 37 disaster and incident databases covering 27 incident types, compile a unified overview of the contained data and their collection processes, and identify the missing or incomplete information. The resulting data collection reveals six major use cases for social media analysis in incident data collection: (1) impact assessment and verification of model predictions, (2) narrative generation, (3) recruiting citizen volunteers, (4) supporting weakly institutionalized areas, (5) narrowing surveillance areas, and (6) reporting triggers for periodical surveillance. Furthermore, we discuss the benefits and shortcomings of using social media data for closing information gaps related to incidents and disasters.},
  subject   = {Katastrophe},
  language  = {en},
}

% PhD thesis. TODO(review): year is missing -- add once confirmed.
@phdthesis{Voelske,
  author   = {V{\"o}lske, Michael},
  title    = {Retrieval Enhancements for Task-Based Web Search},
  school   = {Bauhaus-Universit{\"a}t Weimar},
  doi      = {10.25643/bauhaus-universitaet.3942},
  url      = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20190709-39422},
  abstract = {The task-based view of web search implies that retrieval should take the user perspective into account. Going beyond merely retrieving the most relevant result set for the current query, the retrieval system should aim to surface results that are actually useful to the task that motivated the query. This dissertation explores how retrieval systems can better understand and support their users' tasks from three main angles: First, we study and quantify search engine user behavior during complex writing tasks, and how task success and behavior are associated in such settings. Second, we investigate search engine queries formulated as questions, and explore patterns in a large query log that may help search engines to better support this increasingly prevalent interaction pattern. Third, we propose a novel approach to reranking the search result lists produced by web search engines, taking into account retrieval axioms that formally specify properties of a good ranking.},
  subject  = {Information Retrieval},
  language = {en},
}

% Journal article. As above, the export had the year in the volume field and
% "volume 72, issue 7" in the number field; split into year, volume, number.
% The abstract contained a mis-encoded em dash, restored here as ---.
@article{VakkariVoelskePotthastetal.,
  author    = {Vakkari, Pertti and V{\"o}lske, Michael and Potthast, Martin and Hagen, Matthias and Stein, Benno},
  title     = {Predicting essay quality from search and writing behavior},
  journal   = {Journal of Association for Information Science and Technology},
  year      = {2021},
  volume    = {72},
  number    = {7},
  pages     = {839--852},
  publisher = {Wiley},
  address   = {Hoboken, NJ},
  doi       = {10.1002/asi.24451},
  url       = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20210804-44692},
  abstract  = {Few studies have investigated how search behavior affects complex writing tasks. We analyze a dataset of 150 long essays whose authors searched the ClueWeb09 corpus for source material, while all querying, clicking, and writing activity was meticulously recorded. We model the effect of search and writing behavior on essay quality using path analysis. Since the boil-down and build-up writing strategies identified in previous research have been found to affect search behavior, we model each writing strategy separately. Our analysis shows that the search process contributes significantly to essay quality through both direct and mediated effects, while the author's writing strategy moderates this relationship. Our models explain 25--35\% of the variation in essay quality through rather simple search and writing process characteristics alone, a fact that has implications on how search engines could personalize result pages for writing tasks. Authors' writing strategies and associated searching patterns differ, producing differences in essay quality. In a nutshell: essay quality improves if search and writing strategies harmonize---build-up writers benefit from focused, in-depth querying, while boil-down writers fare better with a broader and shallower querying strategy.},
  subject   = {Information Retrieval},
  language  = {en},
}

% PhD thesis; pages holds the total page count as exported.
% TODO(review): year is missing -- add once confirmed.
@phdthesis{Potthast,
  author   = {Potthast, Martin},
  title    = {Technologies for Reusing Text from the Web},
  school   = {Bauhaus-Universit{\"a}t Weimar},
  pages    = {237},
  doi      = {10.25643/bauhaus-universitaet.1566},
  url      = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20120217-15663},
  abstract = {Texts from the web can be reused individually or in large quantities. The former is called text reuse and the latter language reuse. We first present a comprehensive overview of the different ways in which text and language is reused today, and how exactly information retrieval technologies can be applied in this respect. The remainder of the thesis then deals with specific retrieval tasks. In general, our contributions consist of models and algorithms, their evaluation, and for that purpose, large-scale corpus construction. The thesis divides into two parts. The first part introduces technologies for text reuse detection, and our contributions are as follows: (1) A unified view of projecting-based and embedding-based fingerprinting for near-duplicate detection and the first time evaluation of fingerprint algorithms on Wikipedia revision histories as a new, large-scale corpus of near-duplicates. (2) A new retrieval model for the quantification of cross-language text similarity, which gets by without parallel corpora. We have evaluated the model in comparison to other models on many different pairs of languages. (3) An evaluation framework for text reuse and particularly plagiarism detectors, which consists of tailored detection performance measures and a large-scale corpus of automatically generated and manually written plagiarism cases. The latter have been obtained via crowdsourcing. This framework has been successfully applied to evaluate many different state-of-the-art plagiarism detection approaches within three international evaluation competitions. The second part introduces technologies that solve three retrieval tasks based on language reuse, and our contributions are as follows: (4) A new model for the comparison of textual and non-textual web items across media, which exploits web comments as a source of information about the topic of an item. In this connection, we identify web comments as a largely neglected information source and introduce the rationale of comment retrieval. (5) Two new algorithms for query segmentation, which exploit web n-grams and Wikipedia as a means of discerning the user intent of a keyword query. Moreover, we crowdsource a new corpus for the evaluation of query segmentation which surpasses existing corpora by two orders of magnitude. (6) A new writing assistance tool called Netspeak, which is a search engine for commonly used language. Netspeak indexes the web in the form of web n-grams as a source of writing examples and implements a wildcard query processor on top of it.},
  subject  = {Information Retrieval},
  language = {en},
}

% PhD thesis. TODO(review): year is missing -- add once confirmed.
@phdthesis{Lipka,
  author   = {Lipka, Nedim},
  title    = {Modeling Non-Standard Text Classification Tasks},
  school   = {Bauhaus-Universit{\"a}t Weimar},
  doi      = {10.25643/bauhaus-universitaet.1862},
  url      = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20130307-18626},
  abstract = {Text classification deals with discovering knowledge in texts and is used for extracting, filtering, or retrieving information in streams and collections. The discovery of knowledge is operationalized by modeling text classification tasks, which is mainly a human-driven engineering process. The outcome of this process, a text classification model, is used to inductively learn a text classification solution from a priori classified examples. The building blocks of modeling text classification tasks cover four aspects: (1) the way examples are represented, (2) the way examples are selected, (3) the way classifiers learn from examples, and (4) the way models are selected. This thesis proposes methods that improve the prediction quality of text classification solutions for unseen examples, especially for non-standard tasks where standard models do not fit. The original contributions are related to the aforementioned building blocks: (1) Several topic-orthogonal text representations are studied in the context of non-standard tasks and a new representation, namely co-stems, is introduced. (2) A new active learning strategy that goes beyond standard sampling is examined. (3) A new one-class ensemble for improving the effectiveness of one-class classification is proposed. (4) A new model selection framework to cope with subclass distribution shifts that occur in dynamic environments is introduced.},
  subject  = {Text Classification},
  language = {en},
}

% Bachelor thesis. The export used the misspelled entry type "masterthesis";
% the standard type is mastersthesis, with the type field overriding the
% printed label. German nouns in the title are braced so that sentence-casing
% styles keep their capitals.
% TODO(review): year is missing -- add once confirmed.
@mastersthesis{Lang,
  type     = {Bachelor Thesis},
  author   = {Lang, Kevin},
  title    = {Worteinbettung als semantisches {Feature} in der argumentativen {Analyse}},
  school   = {Bauhaus-Universit{\"a}t Weimar},
  pages    = {54},
  doi      = {10.25643/bauhaus-universitaet.3934},
  url      = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20190617-39343},
  abstract = {Diese Arbeit besch{\"a}ftigt sich mit der Nutzung von Worteinbettungen in der automatischen Analyse von argumentativen Texten. Die Arbeit diskutiert wichtige Einstellungen des Einbettungsverfahren sowie diverse Anwendungsmethoden der eingebetteten Wortvektoren f{\"u}r drei Aufgaben der automatischen argumentativen Analyse: Textsegmentierung, Argumentativit{\"a}ts-Klassifikation und Relationenfindung. Meine Experimente auf zwei Standard-Argumentationsdatens{\"a}tzen zeigen die folgenden Haupterkenntnisse: Bei der Textsegmentierung konnten keine Verbesserungen erzielt werden, w{\"a}hrend in der Argumentativit{\"a}ts-Klassifikation und der Relationenfindung sich kleine Erfolge gezeigt haben und weitere bestimmte Forschungsthesen bewahrheitet werden konnten. In der Diskussion wird darauf eingegangen, warum bei der einfachen Worteinbettung in der argumentativen Analyse sich kaum nutzbare Ergebnisse erzielen lassen konnten, diese sich aber in Zukunft durch erweiterte Worteinbettungsverfahren verbessern k{\"o}nnen.},
  subject  = {Argumentation},
  language = {de},
}

% Master thesis. This entry previously reused the key Lang, so it was reported
% as a repeated entry and skipped; it now has its own key, while Lang keeps
% pointing at the bachelor thesis above. The entry type was misc although the
% record carries school and a thesis type, so mastersthesis is used instead.
% TODO(review): year is missing -- add once confirmed.
@mastersthesis{LangMasterThesis,
  type     = {Master Thesis},
  author   = {Lang, Kevin},
  title    = {Argument Search with Voice Assistants},
  school   = {Bauhaus-Universit{\"a}t Weimar},
  pages    = {100},
  doi      = {10.25643/bauhaus-universitaet.3935},
  url      = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20190617-39353},
  abstract = {The need for finding persuasive arguments can arise in a variety of domains such as politics, finance, marketing or personal entertainment. In these domains, there is a demand to make decisions by oneself or to convince somebody about a specific topic. To obtain a conclusion, one has to search thoroughly different sources in literature and on the web to compare various arguments. Voice interfaces, in form of smartphone applications or smart speakers, present the user with natural conversations in a comfortable way to make search requests in contrast to a traditional search interface with keyboard and display. Benefits and obstacles of such a new interface are analyzed by conducting two studies. The first one consists of a survey for analyzing the target group with questions about situations, motivations, and possible demanding features. The latter one is a wizard-of-oz experiment to investigate possible queries on how a user formulates requests to such a novel system. The results indicate that a search interface with conversational abilities can build a helpful assistant, but to satisfy the demands of a broader audience some additional information retrieval and visualization features need to be implemented.},
  subject  = {Amazon Alexa},
  language = {en},
}

% PhD thesis (the only thesis record that already carried a year).
@phdthesis{Kiesel2022,
  author   = {Kiesel, Johannes},
  title    = {Harnessing Web Archives to Tackle Selected Societal Challenges},
  school   = {Bauhaus-Universit{\"a}t Weimar},
  year     = {2022},
  doi      = {10.25643/bauhaus-universitaet.4660},
  url      = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20220622-46602},
  abstract = {With the growing importance of the World Wide Web, the major challenges our society faces are also increasingly affecting the digital areas of our lives. Some of the associated problems can be addressed by computer science, and some of these specifically by data-driven research. To do so, however, requires to solve open issues related to archive quality and the large volume and variety of the data contained. This dissertation contributes data, algorithms, and concepts towards leveraging the big data and temporal provenance capabilities of web archives to tackle societal challenges. We selected three such challenges that highlight the central issues of archive quality, data volume, and data variety, respectively: (1) For the preservation of digital culture, this thesis investigates and improves the automatic quality assurance of the web page archiving process, as well as the further processing of the resulting archive data for automatic analysis. (2) For the critical assessment of information, this thesis examines large datasets of Wikipedia and news articles and presents new methods for automatically determining quality and bias. (3) For digital security and privacy, this thesis exploits the variety of content on the web to quantify the security of mnemonic passwords and analyzes the privacy-aware re-finding of the various seen content through private web archives.},
  subject  = {Informatik},
  language = {en},
}

% PhD thesis; pages holds the total page count as exported.
% TODO(review): year is missing -- add once confirmed.
@phdthesis{Gollub,
  author   = {Gollub, Tim},
  title    = {Information Retrieval for the Digital Humanities},
  school   = {Bauhaus-Universit{\"a}t Weimar},
  pages    = {177},
  doi      = {10.25643/bauhaus-universitaet.4673},
  url      = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20220801-46738},
  abstract = {In ten chapters, this thesis presents information retrieval technology which is tailored to the research activities that arise in the context of corpus-based digital humanities projects. The presentation is structured by a conceptual research process that is introduced in Chapter 1. The process distinguishes a set of five research activities: research question generation, corpus acquisition, research question modeling, corpus annotation, and result dissemination. Each of these research activities elicits different information retrieval tasks with special challenges, for which algorithmic approaches are presented after an introduction of the core information retrieval concepts in Chapter 2. A vital concept in many of the presented approaches is the keyquery paradigm introduced in Chapter 3, which represents an operation that returns relevant search queries in response to a given set of input documents. Keyqueries are proposed in Chapter 4 for the recommendation of related work, and in Chapter 5 for improving access to aspects hidden in the long tail of search result lists. With pseudo-descriptions, a document expansion approach is presented in Chapter 6. The approach improves the retrieval performance for corpora where only bibliographic meta-data is originally available. In Chapter 7, the keyquery paradigm is employed to generate dynamic taxonomies for corpora in an unsupervised fashion. Chapter 8 turns to the exploration of annotated corpora, and presents scoped facets as a conceptual extension to faceted search systems, which is particularly useful in exploratory search settings. For the purpose of highlighting the major topical differences in a sequence of sub-corpora, an algorithm called topical sequence profiling is presented in Chapter 9. The thesis concludes with two pilot studies regarding the visualization of (re)search results for the means of successful result dissemination: a metaphoric interpretation of the information nutrition label, as well as the philosophical bodies, which are 3D-printed search results.},
  subject  = {Information Retrieval},
  language = {en},
}

% PhD thesis (German). Nouns in the title are braced so that sentence-casing
% styles keep their capitals; pages holds the total page count as exported.
% TODO(review): year is missing -- add once confirmed.
@phdthesis{Bunte,
  author   = {Bunte, Andreas},
  title    = {Entwicklung einer ontologiebasierten {Beschreibung} zur {Erh{\"o}hung} des {Automatisierungsgrades} in der {Produktion}},
  school   = {Bauhaus-Universit{\"a}t Weimar},
  pages    = {189},
  doi      = {10.25643/bauhaus-universitaet.4315},
  url      = {http://nbn-resolving.de/urn:nbn:de:gbv:wim2-20201215-43156},
  abstract = {Die zu beobachtenden k{\"u}rzeren Produktlebenszyklen und eine schnellere Marktdurchdringung von Produkttechnologien erfordern adaptive und leistungsf{\"a}hige Produktionsanlagen. Die Adaptivit{\"a}t erm{\"o}glicht eine Anpassung der Produktionsanlage an neue Produkte, und die Leistungsf{\"a}higkeit der Anlage stellt sicher, dass ausreichend Produkte in kurzer Zeit und zu geringen Kosten hergestellt werden k{\"o}nnen. Durch eine Modularisierung der Produktionsanlage kann die Adaptivit{\"a}t erreicht werden. Jedoch erfordert heutzutage jede Adaption manuellen Aufwand, z.B. zur Anpassung von propriet{\"a}ren Signalen oder zur Anpassung {\"u}bergeordneter Funktionen. Dadurch sinkt die Leistungsf{\"a}higkeit der Anlage. Das Ziel dieser Arbeit ist es, die Interoperabilit{\"a}t in Bezug auf die Informationsverwendung in modularen Produktionsanlagen zu gew{\"a}hrleisten. Dazu werden Informationen durch semantische Modelle beschrieben. Damit wird ein einheitlicher Informationszugriff erm{\"o}glicht, und {\"u}bergeordnete Funktionen erhalten Zugriff auf alle Informationen der Produktionsmodule, unabh{\"a}ngig von dem Typ, dem Hersteller und dem Alter des Moduls. Dadurch entf{\"a}llt der manuelle Aufwand bei Anpassungen des modularen Produktionssystems, wodurch die Leistungsf{\"a}higkeit der Anlage gesteigert und Stillstandszeiten reduziert werden. Nach dem Ermitteln der Anforderungen an einen Modellierungsformalismus wurden potentielle Formalismen mit den Anforderungen abgeglichen. OWL DL stellte sich als geeigneter Formalismus heraus und wurde f{\"u}r die Erstellung des semantischen Modells in dieser Arbeit verwendet. Es wurde exemplarisch ein semantisches Modell f{\"u}r die drei Anwendungsf{\"a}lle Interaktion, Orchestrierung und Diagnose erstellt. Durch einen Vergleich der Modellierungselemente von unterschiedlichen Anwendungsf{\"a}llen wurde die Allgemeing{\"u}ltigkeit des Modells bewertet. Dabei wurde gezeigt, dass die Erreichung eines allgemeinen Modells f{\"u}r technische Anwendungsf{\"a}lle m{\"o}glich ist und lediglich einige Hundert Begriffe ben{\"o}tigt. Zur Evaluierung der erstellten Modelle wurde ein wandlungsf{\"a}higes Produktionssystem der SmartFactoryOWL verwendet, an dem die Anwendungsf{\"a}lle umgesetzt wurden. Dazu wurde eine Laufzeitumgebung erstellt, die die semantischen Modelle der einzelnen Module zu einem Gesamtmodell vereint, Daten aus der Anlage in das Modell {\"u}bertr{\"a}gt und eine Schnittstelle f{\"u}r die Services bereitstellt. Die Services realisieren {\"u}bergeordnete Funktionen und verwenden die Informationen des semantischen Modells. In allen drei Anwendungsf{\"a}llen wurden die semantischen Modelle korrekt zusammengef{\"u}gt und mit den darin enthaltenen Informationen konnte die Aufgabe des jeweiligen Anwendungsfalles ohne zus{\"a}tzlichen manuellen Aufwand gel{\"o}st werden.},
  subject  = {Ontologie},
  language = {de},
}