/ Introduction
Language technologies are at the core of media technologies. This work package aims to provide datasets and models for Norwegian (Bokmål/Nynorsk) that support the automated understanding as well as the automated production of media texts in this language.
Objective: WP5 adopts theoretical approaches and methodologies primarily based on linguistic data science, including neural learning. Based on language data in the media from the user partners and data and tools at the research partners, large corpora will be annotated. The labelled examples in these corpora will be used for training and evaluating supervised models that demonstrate advanced approaches in areas such as robust deep language analysis, adaptive language generation, event identification and extraction, and opinion analysis. The partners will cooperate to explore the use of such models for innovative purposes.
/ People






/ Publications
2022
Samia Touileb; Debora Nozza
Measuring Harmful Representations in Scandinavian Language Models Conference
2022.
@conference{Touileb2022b,
title = {Measuring Harmful Representations in Scandinavian Language Models},
author = {Samia Touileb and Debora Nozza},
url = {https://mediafutures.no/2211-11678/},
year = {2022},
date = {2022-11-21},
urldate = {2022-11-21},
abstract = {Scandinavian countries are perceived as role models when it comes to gender equality. With the advent of pre-trained language models and their widespread usage, we investigate to what extent gender-based harmful and toxic content exists in selected Scandinavian language models. We examine nine models, covering Danish, Swedish, and Norwegian, by manually creating template-based sentences and probing the models for completion. We evaluate the completions using two methods for measuring harmful and toxic completions and provide a thorough analysis of the results. We show that Scandinavian pre-trained language models contain harmful and gender-based stereotypes with similar values across all languages. This finding goes against the general expectations related to gender equality in Scandinavian countries and shows the possible problematic outcomes of using such models in real-world settings.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
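To illustrate the probing procedure described in the abstract above, the following minimal Python sketch queries a masked language model with template sentences via the Hugging Face fill-mask pipeline. The model name and the two templates are illustrative assumptions, not the nine models or the templates used in the paper.

# Minimal sketch of template-based probing of a masked language model.
# The model name and templates are illustrative assumptions; the paper probes
# nine Danish, Swedish, and Norwegian models with its own template set.
from transformers import pipeline

# A Norwegian BERT model is used here as an example; any masked LM works.
unmasker = pipeline("fill-mask", model="NbAiLab/nb-bert-base")

templates = [
    "Kvinner er [MASK].",   # "Women are [MASK]."
    "Menn er [MASK].",      # "Men are [MASK]."
]

for template in templates:
    print(template)
    # Retrieve the top-5 completions the model proposes for the masked slot.
    for completion in unmasker(template, top_k=5):
        print(f"  {completion['token_str']!r}  (score={completion['score']:.3f})")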
Petter Mæhlum; Andre Kåsen; Samia Touileb; Jeremy Barnes
Annotating Norwegian language varieties on Twitter for Part-of-speech Workshop
2022.
@workshop{Mæhlum2022,
title = {Annotating Norwegian language varieties on Twitter for Part-of-speech},
author = {Petter Mæhlum and Andre Kåsen and Samia Touileb and Jeremy Barnes},
url = {https://mediafutures.no/2022-vardial-1-7/},
year = {2022},
date = {2022-10-24},
abstract = {Norwegian Twitter data poses an interesting challenge for Natural Language Processing (NLP) tasks. These texts are difficult for models trained on standardized text in one of the two Norwegian written forms (Bokmål and Nynorsk), as they contain both the typical variation of social media text, as well as a large amount of dialectal variety. In this paper we present a novel Norwegian Twitter dataset annotated with POS-tags. We show that models trained on Universal Dependency (UD) data perform worse when evaluated against this dataset, and that models trained on Bokmål generally perform better than those trained on Nynorsk. We also see that performance on dialectal tweets is comparable to the written standards for some models. Finally we perform a detailed analysis of the errors that models commonly make on this data.},
keywords = {},
pubstate = {published},
tppubtype = {workshop}
}
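As a rough illustration of the evaluation described above, the sketch below scores a tagger trained on standardized Bokmål against a gold-annotated, tweet-like example and reports token-level POS accuracy. The spaCy model name and the tiny dialectal example are assumptions for illustration only, not the dataset or models evaluated in the paper.

# Sketch: token-level POS accuracy of a Bokmål-trained tagger on a dialectal tweet.
# The spaCy model name and the gold-annotated example are illustrative assumptions.
import spacy
from spacy.tokens import Doc

nlp = spacy.load("nb_core_news_sm")  # tagger trained on standardized Bokmål (UD)

# A hypothetical dialectal tweet, whitespace-tokenised, with gold UD POS tags.
gold_tokens = ["E", "gler", "me", "te", "helga", "!"]
gold_tags = ["PRON", "VERB", "PRON", "ADP", "NOUN", "PUNCT"]

# Keep the gold tokenisation so predictions and gold tags align one-to-one.
doc = nlp(Doc(nlp.vocab, words=gold_tokens))
correct = sum(token.pos_ == tag for token, tag in zip(doc, gold_tags))
print(f"POS accuracy: {correct / len(gold_tags):.2%}")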
Samia Touileb; Lilja Øvrelid; Erik Velldal
Occupational Biases in Norwegian and Multilingual Language Models Workshop
2022.
@workshop{Touileb2022,
title = {Occupational Biases in Norwegian and Multilingual Language Models},
author = {Samia Touileb and Lilja Øvrelid and Erik Velldal },
url = {https://mediafutures.no/2022-gebnlp-1-21/},
year = {2022},
date = {2022-07-01},
abstract = {In this paper we explore how a demographic distribution of occupations, along gender dimensions, is reflected in pre-trained language models. We give a descriptive assessment of the distribution of occupations, and investigate to what extent these are reflected in four Norwegian and two multilingual models. To this end, we introduce a set of simple bias probes, and perform five different tasks combining gendered pronouns, first names, and a set of occupations from the Norwegian statistics bureau. We show that language-specific models obtain more accurate results, and are much closer to the real-world distribution of clearly gendered occupations. However, we see that none of the models have correct representations of the occupations that are demographically balanced between genders. We also discuss the importance of the data on which the models were trained, and argue that template-based bias probes can sometimes be fragile, and a simple alteration in a template can change a model’s behavior.},
keywords = {},
pubstate = {published},
tppubtype = {workshop}
}
2020
Samia Touileb; Lilja Øvrelid; Erik Velldal
Gender and sentiment, critics and authors: a dataset of Norwegian book reviews Journal Article
In: Gender Bias in Natural Language Processing. Association for Computational Linguistics, 2020, (Pre SFI).
@article{Touileb2020,
title = {Gender and sentiment, critics and authors: a dataset of Norwegian book reviews},
author = {Samia Touileb and Lilja Øvrelid and Erik Velldal},
url = {https://www.aclweb.org/anthology/2020.gebnlp-1.11.pdf},
year = {2020},
date = {2020-12-01},
journal = {Gender Bias in Natural Language Processing. Association for Computational Linguistics},
abstract = {Gender bias in models and datasets is widely studied in NLP. The focus has usually been on analysing how females and males express themselves, or how females and males are described. However, a less studied aspect is the combination of these two perspectives, how female and male describe the same or opposite gender. In this paper, we present a new gender annotated sentiment dataset of critics reviewing the works of female and male authors. We investigate if this newly annotated dataset contains differences in how the works of male and female authors are critiqued, in particular in terms of positive and negative sentiment. We also explore the differences in how this is done by male and female critics. We show that there are differences in how critics assess the works of authors of the same or opposite gender. For example, male critics rate crime novels written by females, and romantic and sentimental works written by males, more negatively.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
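The comparison described in the abstract, aggregating review sentiment by critic and author gender, can be illustrated with a small pandas sketch. The column names and toy rows below are hypothetical and do not reflect the dataset's actual schema.

# Illustrative aggregation of review sentiment by critic and author gender.
# The column names and toy rows are hypothetical, not the dataset's real schema.
import pandas as pd

reviews = pd.DataFrame(
    {
        "critic_gender": ["M", "M", "F", "F", "M", "F"],
        "author_gender": ["F", "M", "M", "F", "F", "M"],
        "genre":         ["crime", "crime", "romance", "crime", "romance", "romance"],
        "rating":        [3, 5, 4, 4, 2, 5],   # e.g. a 1-6 "dice" rating
    }
)

# Mean rating for every combination of critic gender and author gender.
print(reviews.groupby(["critic_gender", "author_gender"])["rating"].mean())

# The same breakdown restricted to one genre, mirroring the crime-novel example.
crime = reviews[reviews["genre"] == "crime"]
print(crime.groupby(["critic_gender", "author_gender"])["rating"].mean())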
J Barnes; Erik Velldal; Lilja Øvrelid
Improving sentiment analysis with multi-task learning of negation Journal Article
In: 2020, (Pre SFI).
@article{Barnes2020,
title = {Improving sentiment analysis with multi-task learning of negation},
author = {J Barnes and Erik Velldal and Lilja Øvrelid},
url = {https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/improving-sentiment-analysis-with-multitask-learning-of-negation/14EF2B829EC4B8EC29E7C0C5C77B95B0},
year = {2020},
date = {2020-11-11},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
J Barnes; Lilja Øvrelid; Erik Velldal
Sentiment analysis is not solved! Assessing and probing sentiment classification Proceeding
2020, (Pre SFI).
@proceedings{Barnes2020b,
title = {Sentiment analysis is not solved! Assessing and probing sentiment classification},
author = {J Barnes and Lilja Øvrelid and Erik Velldal},
url = {https://www.aclweb.org/anthology/W19-4802/},
year = {2020},
date = {2020-08-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Wafia Adouane; Samia Touileb; Jean-Philippe Bernardy
Identifying Sentiments in Algerian Code-switched User-generated Comments Conference
2020, (Pre SFI).
@conference{Adouane2020,
title = {Identifying Sentiments in Algerian Code-switched User-generated Comments},
author = {Wafia Adouane and Samia Touileb and Jean-Philippe Bernardy},
url = {https://www.aclweb.org/anthology/2020.lrec-1.328.pdf},
year = {2020},
date = {2020-05-06},
journal = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},
pages = {2698–2705},
abstract = {We present in this paper our work on Algerian language, an under-resourced North African colloquial Arabic variety, for which we built a comparably large corpus of more than 36,000 code-switched user-generated comments annotated for sentiments. We opted for this data domain because Algerian is a colloquial language with no existing freely available corpora. Moreover, we compiled sentiment lexicons of positive and negative unigrams and bigrams reflecting the code-switches present in the language. We compare the performance of four models on the task of identifying sentiments, and the results indicate that a CNN model trained end-to-end fits better our unedited code-switched and unbalanced data across the predefined sentiment classes. Additionally, injecting the lexicons as background knowledge to the model boosts its performance on the minority class with a gain of 10.54 points on the F-score. The results of our experiments can be used as a baseline for future research for Algerian sentiment analysis.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
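The setup sketched below mirrors, in simplified form, the approach described in the abstract: a small end-to-end CNN sentence classifier whose pooled representation is augmented with lexicon-based count features before classification. It is a generic PyTorch illustration, not the paper's exact architecture; vocabulary size, lexicon features, and hyperparameters are placeholder assumptions.

# Generic sketch (PyTorch): a CNN sentence classifier whose pooled representation
# is concatenated with simple lexicon-count features before the output layer.
# Vocabulary, lexicons, and hyperparameters are placeholder assumptions.
import torch
import torch.nn as nn

class LexiconAugmentedCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, num_filters=64,
                 kernel_sizes=(2, 3, 4), num_lexicon_feats=2, num_classes=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList(
            [nn.Conv1d(embed_dim, num_filters, k) for k in kernel_sizes]
        )
        self.out = nn.Linear(num_filters * len(kernel_sizes) + num_lexicon_feats,
                             num_classes)

    def forward(self, token_ids, lexicon_feats):
        # token_ids: (batch, seq_len); lexicon_feats: (batch, num_lexicon_feats)
        x = self.embedding(token_ids).transpose(1, 2)          # (batch, embed, seq)
        pooled = [torch.relu(conv(x)).max(dim=2).values for conv in self.convs]
        features = torch.cat(pooled + [lexicon_feats], dim=1)  # inject lexicon info
        return self.out(features)

# Toy usage: counts of positive/negative lexicon hits act as "background knowledge".
model = LexiconAugmentedCNN(vocab_size=5000)
token_ids = torch.randint(1, 5000, (8, 40))       # batch of 8 padded comments
lexicon_feats = torch.rand(8, 2)                  # e.g. normalised pos/neg counts
logits = model(token_ids, lexicon_feats)          # (8, 3) sentiment scores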
Lilja Øvrelid; P Mæhlum; J Barnes; Erik Velldal
A Fine-Grained Sentiment Dataset for Norwegian Proceeding
2020, (Pre SFI).
@proceedings{Øvrelid2020,
title = {A Fine-Grained Sentiment Dataset for Norwegian},
author = {Lilja Øvrelid and P Mæhlum and J Barnes and Erik Velldal},
url = {https://www.aclweb.org/anthology/2020.lrec-1.618/},
year = {2020},
date = {2020-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
F Jørgensen; T Aasmoe; ASR Husevåg; Lilja Øvrelid; Erik Velldal (Ed.)
NorNE: Annotating Named Entities for Norwegian Proceeding
2020, (Pre SFI).
@proceedings{Jørgensen2020,
title = {NorNE: Annotating Named Entities for Norwegian},
editor = {F Jørgensen and T Aasmoe and ASR Husevåg and Lilja Øvrelid and Erik Velldal},
url = {https://oda.oslomet.no/handle/10642/8830},
year = {2020},
date = {2020-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
P Meurer; V Rosén; Koenraad De Smedt
Interactive Visualizations in INESS Book Chapter
In: Butt, M.; Hautli-Janisz, A.; Lyding, V. (Eds.): 2020, (Pre SFI).
@inbook{Meurer2020,
title = {Interactive Visualizations in INESS},
author = {P Meurer and V Rosén and Koenraad De Smedt},
editor = {M. Butt and A. Hautli-Janisz and V. Lyding},
url = {https://web.stanford.edu/group/cslipublications/cslipublications/site/9781684000333.shtml},
year = {2020},
date = {2020-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {inbook}
}
Pierre Lison; Aliaksandr Hubin; Jeremy Barnes; Samia Touileb
Named Entity Recognition without Labelled Data: A Weak Supervision Approach Journal Article
In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 1518–1533, 2020, (Pre SFI).
@article{Lison2020,
title = {Named Entity Recognition without Labelled Data: A Weak Supervision Approach},
author = {Pierre Lison and Aliaksandr Hubin and Jeremy Barnes and Samia Touileb},
url = {https://arxiv.org/pdf/2004.14723.pdf},
year = {2020},
date = {2020-04-30},
journal = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
pages = {1518–1533},
abstract = {Named Entity Recognition (NER) performance often degrades rapidly when applied to target domains that differ from the texts observed during training. When in-domain labelled data is available, transfer learning techniques can be used to adapt existing NER models to the target domain. But what should one do when there is no hand-labelled data for the target domain? This paper presents a simple but powerful approach to learn NER models in the absence of labelled data through weak supervision. The approach relies on a broad spectrum of labelling functions to automatically annotate texts from the target domain. These annotations are then merged together using a hidden Markov model which captures the varying accuracies and confusions of the labelling functions. A sequence labelling model can finally be trained on the basis of this unified annotation. We evaluate the approach on two English datasets (CoNLL 2003 and news articles from Reuters and Bloomberg) and demonstrate an improvement of about 7 percentage points in entity-level F1 scores compared to an out-of-domain neural NER model.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
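The pipeline described in the abstract, many labelling functions whose noisy outputs are aggregated into a single annotation, can be illustrated with a deliberately simplified sketch. The paper aggregates with a hidden Markov model; the sketch below uses plain per-token majority voting instead, and the labelling functions are toy assumptions.

# Simplified sketch of the weak-supervision pipeline: several labelling functions
# annotate the same tokens, and their noisy votes are aggregated per token.
# The paper aggregates with a hidden Markov model; plain majority voting is used
# here purely for illustration, and the labelling functions are toy assumptions.
from collections import Counter

LOCATION_GAZETTEER = {"Oslo", "Bergen", "Trondheim"}
TITLE_CUES = {"statsminister", "direktør"}

def lf_capitalised(tokens):
    """Guess PER for capitalised tokens that are not sentence-initial."""
    return ["PER" if i > 0 and t[0].isupper() else "O" for i, t in enumerate(tokens)]

def lf_gazetteer(tokens):
    """Mark tokens found in a small location gazetteer as LOC."""
    return ["LOC" if t in LOCATION_GAZETTEER else "O" for t in tokens]

def lf_title_cue(tokens):
    """Mark a token as PER when the preceding token is a title cue."""
    return ["PER" if i > 0 and tokens[i - 1].lower() in TITLE_CUES else "O"
            for i, t in enumerate(tokens)]

def aggregate(tokens, labelling_functions):
    """Majority vote over all labelling functions, ignoring 'O' abstentions."""
    labels = []
    for votes in zip(*(lf(tokens) for lf in labelling_functions)):
        non_o = [v for v in votes if v != "O"]
        labels.append(Counter(non_o).most_common(1)[0][0] if non_o else "O")
    return labels

tokens = "Statsminister Støre besøkte Bergen i går".split()
print(aggregate(tokens, [lf_capitalised, lf_gazetteer, lf_title_cue]))
# Note that the functions disagree on "Bergen" (PER vs. LOC); modelling each
# function's accuracy and confusions is what the paper's hidden Markov model does.
# The aggregated labels would then train a standard sequence-labelling model.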
Koenraad de Smedt; D Koureas; P Wittenberg
FAIR Digital Objects for Science: From Data Pieces to Actionable Knowledge Units Journal Article
In: 2020, (Pre SFI).
@article{deSmedt2020,
title = {FAIR Digital Objects for Science: From Data Pieces to Actionable Knowledge Units},
author = {Koenraad de Smedt and D Koureas and P Wittenberg},
url = {https://ideas.repec.org/a/gam/jpubli/v8y2020i2p21-d344422.html},
year = {2020},
date = {2020-04-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2019
Jeremy Barnes; Samia Touileb; Lilja Øvrelid; Erik Velldal
Lexicon information in neural sentiment analysis: a multi-task learning approach Conference
Linköping University Electronic Press, 2019, (Pre SFI).
@conference{Barnes2019,
title = {Lexicon information in neural sentiment analysis: a multi-task learning approach},
author = {Jeremy Barnes and Samia Touileb and Lilja Øvrelid and Erik Velldal},
url = {https://www.aclweb.org/anthology/W19-6119.pdf},
year = {2019},
date = {2019-10-01},
journal = {Proceedings of the 22nd Nordic Conference on Computational Linguistics (NoDaLiDa)},
pages = {175–186},
publisher = {Linköping University Electronic Press},
abstract = {This paper explores the use of multi-task learning (MTL) for incorporating external knowledge in neural models. Specifically, we show how MTL can enable a BiLSTM sentiment classifier to incorporate information from sentiment lexicons. Our MTL set-up is shown to improve model performance (compared to a single-task set-up) on both English and Norwegian sentence-level sentiment datasets. The paper also introduces a new sentiment lexicon for Norwegian.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
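A minimal sketch of the multi-task idea described in the abstract: a shared BiLSTM encoder feeds one head that classifies sentence-level sentiment and an auxiliary head that predicts a sentiment-lexicon tag for each token, with the two losses summed during training. Dimensions, label sets, and the loss weighting are illustrative assumptions, not the paper's configuration.

# Sketch (PyTorch) of multi-task learning with a shared BiLSTM encoder:
# a main head classifies sentence sentiment, an auxiliary head predicts a
# sentiment-lexicon tag for each token. Dimensions, label sets, and the loss
# weighting are illustrative assumptions, not the paper's configuration.
import torch
import torch.nn as nn

class MultiTaskBiLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=128,
                 num_sentiment_classes=3, num_lexicon_tags=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.encoder = nn.LSTM(embed_dim, hidden_dim, batch_first=True,
                               bidirectional=True)
        self.sentiment_head = nn.Linear(2 * hidden_dim, num_sentiment_classes)
        self.lexicon_head = nn.Linear(2 * hidden_dim, num_lexicon_tags)

    def forward(self, token_ids):
        states, _ = self.encoder(self.embedding(token_ids))  # (batch, seq, 2*hidden)
        sentence_repr = states.mean(dim=1)                   # simple mean pooling
        return self.sentiment_head(sentence_repr), self.lexicon_head(states)

model = MultiTaskBiLSTM(vocab_size=5000)
token_ids = torch.randint(1, 5000, (4, 20))
sentiment_logits, lexicon_logits = model(token_ids)

# Joint loss: main sentiment task plus the auxiliary lexicon-tagging task.
sentiment_gold = torch.randint(0, 3, (4,))
lexicon_gold = torch.randint(0, 3, (4, 20))
loss = (nn.functional.cross_entropy(sentiment_logits, sentiment_gold)
        + 0.5 * nn.functional.cross_entropy(
              lexicon_logits.reshape(-1, 3), lexicon_gold.reshape(-1)))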
2018
A Kutuzov; Lilja Øvrelid; T Szymanski; Erik Velldal
Diachronic word embeddings and semantic shifts: a survey Proceeding
2018, (Pre SFI).
@proceedings{Kutuzov2018,
title = {Diachronic word embeddings and semantic shifts: a survey},
author = {A Kutuzov and Lilja Øvrelid and T Szymanski and Erik Velldal},
url = {https://www.aclweb.org/anthology/C18-1117/},
year = {2018},
date = {2018-08-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Erik Velldal; Lilja Øvrelid; Eivind Alexander Bergem; Cathrine Stadsnes; Samia Touileb; Fredrik Jørgensen
NoReC: The Norwegian Review Corpus Proceeding
2018, (Pre SFI).
@proceedings{Velldal2018,
title = {NoReC: The Norwegian Review Corpus},
author = {Erik Velldal and Lilja Øvrelid and Eivind Alexander Bergem and Cathrine Stadsnes and Samia Touileb and Fredrik Jørgensen},
year = {2018},
date = {2018-05-12},
url = {https://repo.clarino.uib.no/xmlui/handle/11509/124},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
2017
Samia Touileb; Truls Pedersen; Helle Sjøvaag
Automatic identification of unknown names with specific roles Journal Article
In: Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, pp. 150-158, 2017, (Pre SFI).
@article{Touileb2017,
title = {Automatic identification of unknown names with specific roles},
author = {Samia Touileb and Truls Pedersen and Helle Sjøvaag},
url = {https://www.aclweb.org/anthology/W18-4517.pdf},
year = {2017},
date = {2017-08-01},
journal = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature},
pages = {150-158},
abstract = {Automatically identifying persons in a particular role within a large corpus can be a difficult task, especially if you don’t know who you are actually looking for. Resources compiling names of persons can be available, but no exhaustive lists exist. However, such lists usually contain known names that are “visible” in the national public sphere, and tend to ignore the marginal and international ones. In this article we propose a method for automatically generating suggestions of names found in a corpus of Norwegian news articles, and which “naturally” belong to a given initial list of members, and that were not known (compiled in a list) beforehand. The approach is based, in part, on the assumption that surface level syntactic features reveal parts of the underlying semantic content and can help uncover the structure of the language.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
M Fares; A Kutuzov; S Oepen; Erik Velldal
Word vectors, reuse, and replicability: Towards a community repository of large-text resources Proceeding
2017, (Pre SFI).
@proceedings{Fares2017,
title = { Word vectors, reuse, and replicability: Towards a community repository of large-text resources},
author = {M Fares and A Kutuzov and S Oepen and Erik Velldal},
url = {https://www.duo.uio.no/handle/10852/65205},
year = {2017},
date = {2017-05-22},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
2016
V Rosén; M Thunes; P Haugereid; GS Losnegaard; H Dyvik; P Meurer; G Lyse; Koenraad De Smedt
The enrichment of lexical resources through incremental parsebanking Journal Article
In: 2016, (Pre SFI).
@article{Rosén2016,
title = {The enrichment of lexical resources through incremental parsebanking},
author = {V Rosén and M Thunes and P Haugereid and GS Losnegaard and H Dyvik and P Meurer and G Lyse and Koenraad De Smedt},
url = {https://bora.uib.no/bora-xmlui/handle/1956/15680},
year = {2016},
date = {2016-06-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
H Dyvik; P Meurer; V Rosén; Koenraad De Smedt; P Haugereid; GS Losnegaard; G Lyse; M Thunes
NorGramBank: A 'Deep' Treebank for Norwegian Proceeding
Proceedings of LREC, 2016, (Pre SFI).
@proceedings{Dyvik2016,
title = {NorGramBank: A 'Deep' Treebank for Norwegian},
journal = {Proceedings of LREC},
author = {H Dyvik and P Meurer and V Rosén and Koenraad De Smedt and P Haugereid and GS Losnegaard and G Lyse and M Thunes},
url = {https://www.aclweb.org/anthology/L16-1565.pdf},
year = {2016},
date = {2016-05-16},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Lilja Øvrelid; P Hohle
Universal dependencies for Norwegian Proceeding
2016, (Pre SFI).
@proceedings{Øvrelid2016,
title = { Universal dependencies for Norwegian},
author = {Lilja Øvrelid and P Hohle},
url = {https://www.aclweb.org/anthology/L16-1250/},
year = {2016},
date = {2016-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
V Rosén; Koenraad De Smedt; GS Losnegaard; E Bejcek; A Savary; P Osenova
MWEs in Treebanks: From Survey to Guidelines Proceeding
2016, (Pre SFI).
@proceedings{Rosén2016b,
title = {MWEs in Treebanks: From Survey to Guidelines},
author = {V Rosén and Koenraad De Smedt and GS Losnegaard and E Bejcek and A Savary and P Osenova},
url = {https://www.aclweb.org/anthology/L16-1368.pdf},
year = {2016},
date = {2016-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
2012
E Lapponi; J Read; Lilja Øvrelid
Representing and resolving negation for sentiment analysis Proceeding
2012, (Pre SFI).
@proceedings{Lapponi2012,
title = {Representing and resolving negation for sentiment analysis},
author = {E Lapponi and J Read and Lilja Øvrelid},
url = {https://ieeexplore.ieee.org/document/6406506},
year = {2012},
date = {2012-12-10},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Erik Velldal; Lilja Øvrelid; J Read; S Oepen
Speculation and negation: Rules, rankers, and the role of syntax Journal Article
In: 2012, (Pre SFI).
@article{Velldal2012,
title = {Speculation and negation: Rules, rankers, and the role of syntax},
author = {Erik Velldal and Lilja Øvrelid and J Read and S Oepen},
url = {https://www.mitpressjournals.org/doi/pdf/10.1162/COLI_a_00126},
year = {2012},
date = {2012-01-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
/ Publications
2022
Touileb, Samia; Nozza, Debora
Measuring Harmful Representations in Scandinavian Language Models Conference
2022.
@conference{Touileb2022b,
title = {Measuring Harmful Representations in Scandinavian Language Models},
author = {Samia Touileb and Debora Nozza},
url = {https://mediafutures.no/2211-11678/},
year = {2022},
date = {2022-11-21},
urldate = {2022-11-21},
abstract = {Scandinavian countries are perceived as rolemodels when it comes to gender equality. With the advent of pre-trained language models and their widespread usage, we investigate to what extent gender-based harmful and toxic content exist in selected Scandinavian language models. We examine nine models, covering Danish, Swedish, and Norwegian, by manually creating template-based sentences and probing
the models for completion. We evaluate the completions using two methods for measuring harmful and toxic completions and provide a thorough analysis of the results. We show that Scandinavian pre-trained language models contain harmful and gender-based stereotypes with similar values across all languages.
This finding goes against the general expectations related to gender equality in Scandinavian countries and shows the possible problematic outcomes of using such models in real world settings.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
the models for completion. We evaluate the completions using two methods for measuring harmful and toxic completions and provide a thorough analysis of the results. We show that Scandinavian pre-trained language models contain harmful and gender-based stereotypes with similar values across all languages.
This finding goes against the general expectations related to gender equality in Scandinavian countries and shows the possible problematic outcomes of using such models in real world settings.
Andre Kåsen Petter Mæhlum, Samia Touileb
Annotating Norwegian language varieties on Twitter for Part-of-speech Workshop
2022.
@workshop{Mæhlum2022,
title = {Annotating Norwegian language varieties on Twitter for Part-of-speech},
author = {Petter Mæhlum, Andre Kåsen, Samia Touileb, Jeremy Barnes},
url = {https://mediafutures.no/2022-vardial-1-7/},
year = {2022},
date = {2022-10-24},
abstract = {Norwegian Twitter data poses an interesting challenge for Natural Language Processing (NLP) tasks. These texts are difficult for models trained on standardized text in one of the two Norwegian written forms (Bokmål and Nynorsk), as they contain both the typical variation of social media text, as well as a large amount of dialectal variety. In this paper we present a novel Norwegian Twitter dataset annotated with POS-tags. We show that models trained on Universal Dependency (UD) data perform worse when evaluated against this dataset, and that models trained on Bokmål generally perform better than those trained on Nynorsk. We also see that performance on dialectal tweets is comparable to the written standards for some models. Finally we perform a detailed analysis of the errors that models commonly make on this data.},
keywords = {},
pubstate = {published},
tppubtype = {workshop}
}
Touileb, Samia; Øvrelid, Lilja; Velldal, Erik
Occupational Biases in Norwegian and Multilingual Language Models Workshop
2022.
@workshop{Touileb2022,
title = {Occupational Biases in Norwegian and Multilingual Language Models},
author = {Samia Touileb and Lilja Øvrelid and Erik Velldal },
url = {https://mediafutures.no/2022-gebnlp-1-21/},
year = {2022},
date = {2022-07-01},
abstract = {In this paper we explore how a demographic distribution of occupations, along gender dimensions, is reflected in pre-trained language models. We give a descriptive assessment of the distribution of occupations, and investigate to what extent these are reflected in four Norwegian and two multilingual models. To this end, we introduce a set of simple bias probes, and perform five different tasks combining gendered pronouns, first names, and a set of occupations from the Norwegian statistics bureau. We show that language specific models obtain more accurate results, and are much closer to the real-world distribution of clearly gendered occupations. However, we see that none of the models have correct representations of the occupations that are demographically balanced between genders. We also discuss the importance of the training data on which the models were trained on, and argue that template-based bias probes can sometimes be fragile, and a simple alteration in a template can change a model’s behavior.},
keywords = {},
pubstate = {published},
tppubtype = {workshop}
}
2020
Touileb, Samia; Øvrelid, Lilja; Velldal, Erik
Gender and sentiment, critics and authors: a dataset of Norwegian book reviews Journal Article
In: Gender Bias in Natural Language Processing. Association for Computational Linguistics, 2020, (Pre SFI).
@article{Touileb2020,
title = {Gender and sentiment, critics and authors: a dataset of Norwegian book reviews},
author = {Samia Touileb and Lilja Øvrelid and Erik Velldal},
url = {https://www.aclweb.org/anthology/2020.gebnlp-1.11.pdf},
year = {2020},
date = {2020-12-01},
journal = {Gender Bias in Natural Language Processing. Association for Computational Linguistics},
abstract = {Gender bias in models and datasets is widely studied in NLP. The focus has usually been on analysing how females and males express themselves, or how females and males are described. However, a less studied aspect is the combination of these two perspectives, how female and male describe the same or opposite gender. In this paper, we present a new gender annotated sentiment dataset of critics reviewing the works of female and male authors. We investigate if this newly annotated dataset contains differences in how the works of male and female authors are critiqued, in particular in terms of positive and negative sentiment. We also explore the differences in how this is done by male and female critics. We show that there are differences in how critics assess the works of authors of the same or opposite gender. For example, male critics rate crime novels written by females, and romantic and sentimental works written by males, more negatively.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Barnes, J; Velldal, Erik; Øvrelid, Lilja
Improving sentiment analysis with multi-task learning of negation Journal Article
In: 2020, (Pre SFI).
@article{Barnes2020,
title = {Improving sentiment analysis with multi-task learning of negation},
author = {J Barnes and Erik Velldal and Lilja Øvrelid},
url = {https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/improving-sentiment-analysis-with-multitask-learning-of-negation/14EF2B829EC4B8EC29E7C0C5C77B95B0},
year = {2020},
date = {2020-11-11},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Barnes, J; Øvrelid, Lilja; Velldal, Erik
Sentiment analysis is not solved! Assessing and probing sentiment classification Proceeding
2020, (Pre SFI).
@proceedings{Barnes2020b,
title = {Sentiment analysis is not solved! Assessing and probing sentiment classification},
author = {J Barnes and Lilja Øvrelid and Erik Velldal},
url = {https://www.aclweb.org/anthology/W19-4802/},
year = {2020},
date = {2020-08-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Adouane, Wafia; Touileb, Samia; Bernardy, Jean-Philippe
Identifying Sentiments in Algerian Code-switched User-generated Comments Conference
2020, (Pre SFI).
@conference{Adouane2020,
title = {Identifying Sentiments in Algerian Code-switched User-generated Comments},
author = {Wafia Adouane and Samia Touileb and Jean-Philippe Bernardy},
url = {https://www.aclweb.org/anthology/2020.lrec-1.328.pdf},
year = {2020},
date = {2020-05-06},
journal = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},
pages = {2698–2705},
abstract = {We present in this paper our work on Algerian language, an under-resourced North African colloquial Arabic variety, for which we
built a comparably large corpus of more than 36,000 code-switched user-generated comments annotated for sentiments. We opted
for this data domain because Algerian is a colloquial language with no existing freely available corpora. Moreover, we compiled
sentiment lexicons of positive and negative unigrams and bigrams reflecting the code-switches present in the language. We compare
the performance of four models on the task of identifying sentiments, and the results indicate that a CNN model trained end-to-end fits
better our unedited code-switched and unbalanced data across the predefined sentiment classes. Additionally, injecting the lexicons as
background knowledge to the model boosts its performance on the minority class with a gain of 10.54 points on the F-score. The results
of our experiments can be used as a baseline for future research for Algerian sentiment analysis.
},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
built a comparably large corpus of more than 36,000 code-switched user-generated comments annotated for sentiments. We opted
for this data domain because Algerian is a colloquial language with no existing freely available corpora. Moreover, we compiled
sentiment lexicons of positive and negative unigrams and bigrams reflecting the code-switches present in the language. We compare
the performance of four models on the task of identifying sentiments, and the results indicate that a CNN model trained end-to-end fits
better our unedited code-switched and unbalanced data across the predefined sentiment classes. Additionally, injecting the lexicons as
background knowledge to the model boosts its performance on the minority class with a gain of 10.54 points on the F-score. The results
of our experiments can be used as a baseline for future research for Algerian sentiment analysis.
Øvrelid, Lilja; Mæhlum, P; Barnes, J; Velldal, Erik
A Fine-Grained Sentiment Dataset for Norwegian Proceeding
2020, (Pre SFI).
@proceedings{Øvrelid2020,
title = {A Fine-Grained Sentiment Dataset for Norwegian},
author = {Lilja Øvrelid and P Mæhlum and J Barnes and Erik Velldal},
url = {https://www.aclweb.org/anthology/2020.lrec-1.618/},
year = {2020},
date = {2020-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Jørgensen, F; Aasmoe, T; Husevåg, ASR; Øvrelid, Lilja; Velldal, Erik (Ed.)
NorNE: Annotating Named Entities for Norwegian Proceeding
2020, (Pre SFI).
@proceedings{Jørgensen2020,
title = {NorNE: Annotating Named Entities for Norwegian},
editor = {F Jørgensen and T Aasmoe and ASR Husevåg and Lilja Øvrelid and Erik Velldal},
url = {https://oda.oslomet.no/handle/10642/8830},
year = {2020},
date = {2020-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Meurer, P; Rosén, V; Smedt, Koenraad De
Interactive Visualizations in INESS Book Chapter
In: Butt, M.; Hautli-Janisz, A.; (Eds.), V. Lyding (Ed.): 2020, (Pre SFI).
@inbook{Meurer2020,
title = {Interactive Visualizations in INESS},
author = {P Meurer and V Rosén and Koenraad De Smedt},
editor = {M. Butt and A. Hautli-Janisz and V. Lyding (Eds.)},
url = {https://web.stanford.edu/group/cslipublications/cslipublications/site/9781684000333.shtml},
year = {2020},
date = {2020-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {inbook}
}
Lison, Pierre; Hubin, Aliaksandr; Barnes, Jeremy; Touileb, Samia
Named Entity Recognition without Labelled Data: A Weak Supervision Approach Journal Article
In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 1518–1533, 2020, (Pre SFI).
@article{Lison2020,
title = {Named Entity Recognition without Labelled Data: A Weak Supervision Approach},
author = {Pierre Lison and Aliaksandr Hubin and Jeremy Barnes and Samia Touileb},
url = {https://arxiv.org/pdf/2004.14723.pdf},
year = {2020},
date = {2020-04-30},
journal = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
pages = {1518–1533},
abstract = {Named Entity Recognition (NER) performance often degrades rapidly when applied to target domains that differ from the texts observed during training. When in-domain labelled data is available, transfer learning techniques can be used to adapt existing NER models to the target domain. But what should one do when there is no hand-labelled data for the target domain? This paper presents a simple but powerful approach to learn NER models in the absence of labelled data through weak supervision. The approach relies on a broad spectrum of labelling functions to automatically annotate texts from the target domain. These annotations are then merged together using a hidden Markov model which captures the varying accuracies and confusions of the labelling functions. A sequence labelling model can finally be trained on the basis of this unified annotation. We evaluate the approach on two English datasets (CoNLL 2003 and news articles from Reuters and Bloomberg) and demonstrate an improvement of about 7 percentage points in entity-level F1 scores compared to an out-of-domain neural NER model.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
de Smedt, Koenraad; Koureas, D; Wittenberg, P
FAIR Digital Objects for Science: From Data Pieces to Actionable Knowledge Units Journal Article
In: 2020, (Pre SFI).
@article{deSmedt2020,
title = {FAIR Digital Objects for Science: From Data Pieces to Actionable Knowledge Units},
author = {Koenraad de Smedt and D Koureas and P Wittenberg},
url = {https://ideas.repec.org/a/gam/jpubli/v8y2020i2p21-d344422.html},
year = {2020},
date = {2020-04-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2019
Barnes, Jeremy; Touileb, Samia; Øvrelid, Lilja; Velldal, Erik
Lexicon information in neural sentiment analysis: a multi-task learning approach Conference
Linköping University Electronic Press, 2019, (Pre SFI).
@conference{Barnes2019,
title = {Lexicon information in neural sentiment analysis: a multi-task learning approach},
author = {Jeremy Barnes and Samia Touileb and Lilja Øvrelid and Erik Velldal},
url = {https://www.aclweb.org/anthology/W19-6119.pdf},
year = {2019},
date = {2019-10-01},
journal = {Proceedings of the 22nd Nordic Conference on Computational Linguistics (NoDaLiDa)},
pages = {175–186},
publisher = {Linköping University Electronic Press},
abstract = {This paper explores the use of multi-task learning (MTL) for incorporating external knowledge in neural models. Specifically, we show how MTL can enable a BiLSTM sentiment classifier to incorporate information from sentiment lexicons. Our MTL set-up is shown to improve model performance (compared to a single-task set-up) on both English and Norwegian sentence-level sentiment datasets. The paper also introduces a new sentiment lexicon for Norwegian.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
2018
Kutuzov, A; Øvrelid, Lilja; Szymanski, T; Velldal, Erik
Diachronic word embeddings and semantic shifts: a survey Proceeding
2018, (Pre SFI).
@proceedings{Kutuzov2018,
title = {Diachronic word embeddings and semantic shifts: a survey},
author = {A Kutuzov and Lilja Øvrelid and T Szymanski and Erik Velldal},
url = {https://www.aclweb.org/anthology/C18-1117/},
year = {2018},
date = {2018-08-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Velldal, Erik; Øvrelid, Lilja; Bergem, Eivind Alexander; Stadsnes, Cathrine; Touileb, Samia; Jørgensen, Fredrik
NoReC: The Norwegian Review Corpus Proceeding
2018, (Pre SFI).
@proceedings{Velldal2018,
title = {NoReC: The Norwegian Review Corpus},
author = {Erik Velldal and Lilja Øvrelid and Eivind Alexander Bergem and Cathrine Stadsnes and Samia Touileb and Fredrik Jørgensen},
year = {2018},
date = {2018-05-12},
abstract = {https://repo.clarino.uib.no/xmlui/handle/11509/124},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
2017
Touileb, Samia; Pedersen, Truls; Sjøvaag, Helle
Automatic identification of unknown names with specific roles Journal Article
In: Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, pp. 150-158, 2017, (Pre SFI).
@article{Touileb2017,
title = {Automatic identification of unknown names with specific roles},
author = {Samia Touileb and Truls Pedersen and Helle Sjøvaag},
url = {https://www.aclweb.org/anthology/W18-4517.pdf},
year = {2017},
date = {2017-08-01},
journal = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature},
pages = {150-158},
abstract = {Automatically identifying persons in a particular role within a large corpus can be a difficult task, especially if you don’t know who you are actually looking for. Resources compiling names of persons can be available, but no exhaustive lists exist. However, such lists usually contain known names that are “visible” in the national public sphere, and tend to ignore the marginal and international ones. In this article we propose a method for automatically generating suggestions of names found in a corpus of Norwegian news articles, and which “naturally” belong to a given initial list of members, and that were not known (compiled in a list) beforehand. The approach is based, in part, on the assumption that surface level syntactic features reveal parts of the underlying semantic content and can help uncover the structure of the language.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Fares, M; Kutuzov, A; Oepen, S; Velldal, Erik
Word vectors, reuse, and replicability: Towards a community repository of large-text resources Proceeding
2017, (Pre SFI).
@proceedings{Fares2017,
title = { Word vectors, reuse, and replicability: Towards a community repository of large-text resources},
author = {M Fares and A Kutuzov and S Oepen and Erik Velldal},
url = {https://www.duo.uio.no/handle/10852/65205},
year = {2017},
date = {2017-05-22},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
2016
Rosén, V; Thunes, M; Haugereid, P; Losnegaard, GS; Dyvik, H; Meurer, P; Lyse, G; Smedt, Koenraad De
The enrichment of lexical resources through incremental parsebanking Journal Article
In: 2016, (Pre SFI).
@article{Rosén2016,
title = {The enrichment of lexical resources through incremental parsebanking},
author = {V Rosén and M Thunes and P Haugereid and GS Losnegaard and H Dyvik and P Meurer and G Lyse and Koenraad De Smedt},
url = {https://bora.uib.no/bora-xmlui/handle/1956/15680},
year = {2016},
date = {2016-06-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Dyvik, H; Meurer, P; Rosén, V; Smedt, Koenraad De; Haugereid, P; Losnegaard, GS; Lyse, G; Thunes, M
NorGramBank: A 'Deep' Treebank for Norwegian.Proceedings of LREC Proceeding
2016, (Pre SFI).
@proceedings{Dyvik2016,
title = {NorGramBank: A 'Deep' Treebank for Norwegian.Proceedings of LREC},
author = {H Dyvik and P Meurer and V Rosén and Koenraad De Smedt and P Haugereid and GS Losnegaard and G Lyse and M Thunes},
url = {https://www.aclweb.org/anthology/L16-1565.pdf},
year = {2016},
date = {2016-05-16},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Øvrelid, Lilja; Hohle, P
Universal dependencies for Norwegian Proceeding
2016, (Pre SFI).
@proceedings{Øvrelid2016,
title = { Universal dependencies for Norwegian},
author = {Lilja Øvrelid and P Hohle},
url = {https://www.aclweb.org/anthology/L16-1250/},
year = {2016},
date = {2016-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Rosén, V; Smedt, Koenraad De; Losnegaard, GS; Bejcek, E; Savary, A; Osenova, P
MWEs in Treebanks: From Survey to Guidelines Proceeding
2016, (Pre SFI).
@proceedings{Rosén2016b,
title = {MWEs in Treebanks: From Survey to Guidelines},
author = {V Rosén and Koenraad De Smedt and GS Losnegaard and E Bejcek and A Savary and P Osenova},
url = {https://www.aclweb.org/anthology/L16-1368.pdf},
year = {2016},
date = {2016-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
2012
Lapponi, E; Read, J; Øvrelid, Lilja
Representing and resolving negation for sentiment analysis Proceeding
2012, (Pre SFI).
@proceedings{Lapponi2012,
title = {Representing and resolving negation for sentiment analysis},
author = {E Lapponi and J Read and Lilja Øvrelid},
url = {https://ieeexplore.ieee.org/document/6406506},
year = {2012},
date = {2012-12-10},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Velldal, Erik; Øvrelid, Lilja; Read, J; Oepen, S
Speculation and negation: Rules, rankers, and the role of syntax Journal Article
In: 2012, (Pre SFI).
@article{Velldal2012,
title = {Speculation and negation: Rules, rankers, and the role of syntax},
author = {Erik Velldal and Lilja Øvrelid and J Read and S Oepen},
url = {https://www.mitpressjournals.org/doi/pdf/10.1162/COLI_a_00126},
year = {2012},
date = {2012-01-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
/ Publications
2022
Touileb, Samia; Nozza, Debora
Measuring Harmful Representations in Scandinavian Language Models Conference
2022.
@conference{Touileb2022b,
title = {Measuring Harmful Representations in Scandinavian Language Models},
author = {Samia Touileb and Debora Nozza},
url = {https://mediafutures.no/2211-11678/},
year = {2022},
date = {2022-11-21},
urldate = {2022-11-21},
abstract = {Scandinavian countries are perceived as rolemodels when it comes to gender equality. With the advent of pre-trained language models and their widespread usage, we investigate to what extent gender-based harmful and toxic content exist in selected Scandinavian language models. We examine nine models, covering Danish, Swedish, and Norwegian, by manually creating template-based sentences and probing
the models for completion. We evaluate the completions using two methods for measuring harmful and toxic completions and provide a thorough analysis of the results. We show that Scandinavian pre-trained language models contain harmful and gender-based stereotypes with similar values across all languages.
This finding goes against the general expectations related to gender equality in Scandinavian countries and shows the possible problematic outcomes of using such models in real world settings.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
the models for completion. We evaluate the completions using two methods for measuring harmful and toxic completions and provide a thorough analysis of the results. We show that Scandinavian pre-trained language models contain harmful and gender-based stereotypes with similar values across all languages.
This finding goes against the general expectations related to gender equality in Scandinavian countries and shows the possible problematic outcomes of using such models in real world settings.
Andre Kåsen Petter Mæhlum, Samia Touileb
Annotating Norwegian language varieties on Twitter for Part-of-speech Workshop
2022.
@workshop{Mæhlum2022,
title = {Annotating Norwegian language varieties on Twitter for Part-of-speech},
author = {Petter Mæhlum, Andre Kåsen, Samia Touileb, Jeremy Barnes},
url = {https://mediafutures.no/2022-vardial-1-7/},
year = {2022},
date = {2022-10-24},
abstract = {Norwegian Twitter data poses an interesting challenge for Natural Language Processing (NLP) tasks. These texts are difficult for models trained on standardized text in one of the two Norwegian written forms (Bokmål and Nynorsk), as they contain both the typical variation of social media text, as well as a large amount of dialectal variety. In this paper we present a novel Norwegian Twitter dataset annotated with POS-tags. We show that models trained on Universal Dependency (UD) data perform worse when evaluated against this dataset, and that models trained on Bokmål generally perform better than those trained on Nynorsk. We also see that performance on dialectal tweets is comparable to the written standards for some models. Finally we perform a detailed analysis of the errors that models commonly make on this data.},
keywords = {},
pubstate = {published},
tppubtype = {workshop}
}
Touileb, Samia; Øvrelid, Lilja; Velldal, Erik
Occupational Biases in Norwegian and Multilingual Language Models Workshop
2022.
@workshop{Touileb2022,
title = {Occupational Biases in Norwegian and Multilingual Language Models},
author = {Samia Touileb and Lilja Øvrelid and Erik Velldal },
url = {https://mediafutures.no/2022-gebnlp-1-21/},
year = {2022},
date = {2022-07-01},
abstract = {In this paper we explore how a demographic distribution of occupations, along gender dimensions, is reflected in pre-trained language models. We give a descriptive assessment of the distribution of occupations, and investigate to what extent these are reflected in four Norwegian and two multilingual models. To this end, we introduce a set of simple bias probes, and perform five different tasks combining gendered pronouns, first names, and a set of occupations from the Norwegian statistics bureau. We show that language specific models obtain more accurate results, and are much closer to the real-world distribution of clearly gendered occupations. However, we see that none of the models have correct representations of the occupations that are demographically balanced between genders. We also discuss the importance of the training data on which the models were trained on, and argue that template-based bias probes can sometimes be fragile, and a simple alteration in a template can change a model’s behavior.},
keywords = {},
pubstate = {published},
tppubtype = {workshop}
}
2020
Touileb, Samia; Øvrelid, Lilja; Velldal, Erik
Gender and sentiment, critics and authors: a dataset of Norwegian book reviews Journal Article
In: Gender Bias in Natural Language Processing. Association for Computational Linguistics, 2020, (Pre SFI).
@article{Touileb2020,
title = {Gender and sentiment, critics and authors: a dataset of Norwegian book reviews},
author = {Samia Touileb and Lilja Øvrelid and Erik Velldal},
url = {https://www.aclweb.org/anthology/2020.gebnlp-1.11.pdf},
year = {2020},
date = {2020-12-01},
journal = {Gender Bias in Natural Language Processing. Association for Computational Linguistics},
abstract = {Gender bias in models and datasets is widely studied in NLP. The focus has usually been on analysing how females and males express themselves, or how females and males are described. However, a less studied aspect is the combination of these two perspectives, how female and male describe the same or opposite gender. In this paper, we present a new gender annotated sentiment dataset of critics reviewing the works of female and male authors. We investigate if this newly annotated dataset contains differences in how the works of male and female authors are critiqued, in particular in terms of positive and negative sentiment. We also explore the differences in how this is done by male and female critics. We show that there are differences in how critics assess the works of authors of the same or opposite gender. For example, male critics rate crime novels written by females, and romantic and sentimental works written by males, more negatively.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Barnes, J; Velldal, Erik; Øvrelid, Lilja
Improving sentiment analysis with multi-task learning of negation Journal Article
In: 2020, (Pre SFI).
@article{Barnes2020,
title = {Improving sentiment analysis with multi-task learning of negation},
author = {J Barnes and Erik Velldal and Lilja Øvrelid},
url = {https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/improving-sentiment-analysis-with-multitask-learning-of-negation/14EF2B829EC4B8EC29E7C0C5C77B95B0},
year = {2020},
date = {2020-11-11},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Barnes, J; Øvrelid, Lilja; Velldal, Erik
Sentiment analysis is not solved! Assessing and probing sentiment classification Proceeding
2020, (Pre SFI).
@proceedings{Barnes2020b,
title = {Sentiment analysis is not solved! Assessing and probing sentiment classification},
author = {J Barnes and Lilja Øvrelid and Erik Velldal},
url = {https://www.aclweb.org/anthology/W19-4802/},
year = {2020},
date = {2020-08-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Adouane, Wafia; Touileb, Samia; Bernardy, Jean-Philippe
Identifying Sentiments in Algerian Code-switched User-generated Comments Conference
2020, (Pre SFI).
@conference{Adouane2020,
title = {Identifying Sentiments in Algerian Code-switched User-generated Comments},
author = {Wafia Adouane and Samia Touileb and Jean-Philippe Bernardy},
url = {https://www.aclweb.org/anthology/2020.lrec-1.328.pdf},
year = {2020},
date = {2020-05-06},
journal = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},
pages = {2698–2705},
abstract = {We present in this paper our work on Algerian language, an under-resourced North African colloquial Arabic variety, for which we
built a comparably large corpus of more than 36,000 code-switched user-generated comments annotated for sentiments. We opted
for this data domain because Algerian is a colloquial language with no existing freely available corpora. Moreover, we compiled
sentiment lexicons of positive and negative unigrams and bigrams reflecting the code-switches present in the language. We compare
the performance of four models on the task of identifying sentiments, and the results indicate that a CNN model trained end-to-end fits
better our unedited code-switched and unbalanced data across the predefined sentiment classes. Additionally, injecting the lexicons as
background knowledge to the model boosts its performance on the minority class with a gain of 10.54 points on the F-score. The results
of our experiments can be used as a baseline for future research for Algerian sentiment analysis.
},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
built a comparably large corpus of more than 36,000 code-switched user-generated comments annotated for sentiments. We opted
for this data domain because Algerian is a colloquial language with no existing freely available corpora. Moreover, we compiled
sentiment lexicons of positive and negative unigrams and bigrams reflecting the code-switches present in the language. We compare
the performance of four models on the task of identifying sentiments, and the results indicate that a CNN model trained end-to-end fits
better our unedited code-switched and unbalanced data across the predefined sentiment classes. Additionally, injecting the lexicons as
background knowledge to the model boosts its performance on the minority class with a gain of 10.54 points on the F-score. The results
of our experiments can be used as a baseline for future research for Algerian sentiment analysis.
Øvrelid, Lilja; Mæhlum, P; Barnes, J; Velldal, Erik
A Fine-Grained Sentiment Dataset for Norwegian Proceeding
2020, (Pre SFI).
@proceedings{Øvrelid2020,
title = {A Fine-Grained Sentiment Dataset for Norwegian},
author = {Lilja Øvrelid and P Mæhlum and J Barnes and Erik Velldal},
url = {https://www.aclweb.org/anthology/2020.lrec-1.618/},
year = {2020},
date = {2020-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Jørgensen, F; Aasmoe, T; Husevåg, ASR; Øvrelid, Lilja; Velldal, Erik (Ed.)
NorNE: Annotating Named Entities for Norwegian Proceeding
2020, (Pre SFI).
@proceedings{Jørgensen2020,
title = {NorNE: Annotating Named Entities for Norwegian},
editor = {F Jørgensen and T Aasmoe and ASR Husevåg and Lilja Øvrelid and Erik Velldal},
url = {https://oda.oslomet.no/handle/10642/8830},
year = {2020},
date = {2020-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Meurer, P; Rosén, V; Smedt, Koenraad De
Interactive Visualizations in INESS Book Chapter
In: Butt, M.; Hautli-Janisz, A.; (Eds.), V. Lyding (Ed.): 2020, (Pre SFI).
@inbook{Meurer2020,
title = {Interactive Visualizations in INESS},
author = {P Meurer and V Rosén and Koenraad De Smedt},
editor = {M. Butt and A. Hautli-Janisz and V. Lyding (Eds.)},
url = {https://web.stanford.edu/group/cslipublications/cslipublications/site/9781684000333.shtml},
year = {2020},
date = {2020-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {inbook}
}
Lison, Pierre; Hubin, Aliaksandr; Barnes, Jeremy; Touileb, Samia
Named Entity Recognition without Labelled Data: A Weak Supervision Approach Journal Article
In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 1518–1533, 2020, (Pre SFI).
@article{Lison2020,
title = {Named Entity Recognition without Labelled Data: A Weak Supervision Approach},
author = {Pierre Lison and Aliaksandr Hubin and Jeremy Barnes and Samia Touileb},
url = {https://arxiv.org/pdf/2004.14723.pdf},
year = {2020},
date = {2020-04-30},
journal = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
pages = {1518–1533},
abstract = {Named Entity Recognition (NER) performance often degrades rapidly when applied to target domains that differ from the texts observed during training. When in-domain labelled data is available, transfer learning techniques can be used to adapt existing NER models to the target domain. But what should one do when there is no hand-labelled data for the target domain? This paper presents a simple but powerful approach to learn NER models in the absence of labelled data through weak supervision. The approach relies on a broad spectrum of labelling functions to automatically annotate texts from the target domain. These annotations are then merged together using a hidden Markov model which captures the varying accuracies and confusions of the labelling functions. A sequence labelling model can finally be trained on the basis of this unified annotation. We evaluate the approach on two English datasets (CoNLL 2003 and news articles from Reuters and Bloomberg) and demonstrate an improvement of about 7 percentage points in entity-level F1 scores compared to an out-of-domain neural NER model.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
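As a rough, self-contained illustration of the weak-supervision recipe summarised in the abstract above, the sketch below defines two toy labelling functions (a gazetteer lookup and a title-based pattern) and merges their span predictions by simple voting. The paper aggregates labelling functions with a hidden Markov model and then trains a sequence labeller on the merged annotation; the voting step here is only a simplified stand-in, and the gazetteer, pattern, and example sentence are invented.

import re
from collections import Counter

# Toy gazetteer and labelling functions (invented; not the paper's resources).
PERSON_GAZETTEER = {"Erna Solberg", "Jonas Gahr Støre"}

def lf_gazetteer(text):
    # Label character spans that exactly match a known person name.
    spans = []
    for name in PERSON_GAZETTEER:
        for match in re.finditer(re.escape(name), text):
            spans.append((match.start(), match.end(), "PER"))
    return spans

def lf_title_pattern(text):
    # Label capitalised word sequences that follow a person title.
    spans = []
    for match in re.finditer(r"\b[Ss]tatsminister\s+([A-ZÆØÅ]\w+(?:\s+[A-ZÆØÅ]\w+)*)", text):
        spans.append((match.start(1), match.end(1), "PER"))
    return spans

def aggregate(text, labelling_functions, min_votes=1):
    # Merge labelling-function outputs by span voting (a stand-in for the paper's HMM merger).
    votes = Counter(span for lf in labelling_functions for span in lf(text))
    return sorted(span for span, count in votes.items() if count >= min_votes)

text = "Statsminister Jonas Gahr Støre møtte Erna Solberg i Oslo."
print(aggregate(text, [lf_gazetteer, lf_title_pattern]))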
de Smedt, Koenraad; Koureas, D; Wittenberg, P
FAIR Digital Objects for Science: From Data Pieces to Actionable Knowledge Units Journal Article
In: 2020, (Pre SFI).
@article{deSmedt2020,
title = {FAIR Digital Objects for Science: From Data Pieces to Actionable Knowledge Units},
author = {Koenraad de Smedt and D Koureas and P Wittenberg},
url = {https://ideas.repec.org/a/gam/jpubli/v8y2020i2p21-d344422.html},
year = {2020},
date = {2020-04-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2019
Barnes, Jeremy; Touileb, Samia; Øvrelid, Lilja; Velldal, Erik
Lexicon information in neural sentiment analysis: a multi-task learning approach Conference
Linköping University Electronic Press, 2019, (Pre SFI).
@conference{Barnes2019,
title = {Lexicon information in neural sentiment analysis: a multi-task learning approach},
author = {Jeremy Barnes and Samia Touileb and Lilja Øvrelid and Erik Velldal},
url = {https://www.aclweb.org/anthology/W19-6119.pdf},
year = {2019},
date = {2019-10-01},
journal = {Proceedings of the 22nd Nordic Conference on Computational Linguistics (NoDaLiDa)},
pages = {175–186},
publisher = {Linköping University Electronic Press},
abstract = {This paper explores the use of multi-task learning (MTL) for incorporating external knowledge in neural models. Specifically, we show how MTL can enable a BiLSTM sentiment classifier to incorporate information from sentiment lexicons. Our MTL set-up is shown to improve model performance (compared to a single-task set-up) on both English and Norwegian sentence-level sentiment datasets. The paper also introduces a new sentiment lexicon for Norwegian.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
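The multi-task set-up described in the abstract above can be sketched minimally in PyTorch (this is not the authors' implementation): a shared BiLSTM encoder feeds both a sentence-level sentiment head and an auxiliary token-level head that predicts lexicon tags, and the two cross-entropy losses are summed. The dimensions, the 0.5 auxiliary weight, and the random toy batch are arbitrary illustration choices.

import torch
import torch.nn as nn

class MultiTaskSentimentModel(nn.Module):
    # Shared BiLSTM encoder with two heads: sentence sentiment and token-level lexicon tags.
    def __init__(self, vocab_size, emb_dim=50, hidden=64, n_sent_classes=3, n_lex_tags=3):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.encoder = nn.LSTM(emb_dim, hidden, batch_first=True, bidirectional=True)
        self.sentiment_head = nn.Linear(2 * hidden, n_sent_classes)  # main task
        self.lexicon_head = nn.Linear(2 * hidden, n_lex_tags)        # auxiliary task

    def forward(self, token_ids):
        states, _ = self.encoder(self.embed(token_ids))          # (batch, seq, 2*hidden)
        sent_logits = self.sentiment_head(states.mean(dim=1))    # pooled sentence representation
        lex_logits = self.lexicon_head(states)                    # per-token lexicon predictions
        return sent_logits, lex_logits

# Toy forward/backward pass with random ids and labels.
model = MultiTaskSentimentModel(vocab_size=100)
tokens = torch.randint(1, 100, (4, 7))     # batch of 4 sentences, 7 tokens each
sent_gold = torch.randint(0, 3, (4,))      # sentence-level sentiment labels
lex_gold = torch.randint(0, 3, (4, 7))     # token-level lexicon tags (e.g. pos/neg/none)
sent_logits, lex_logits = model(tokens)
loss = nn.functional.cross_entropy(sent_logits, sent_gold) \
     + 0.5 * nn.functional.cross_entropy(lex_logits.reshape(-1, 3), lex_gold.reshape(-1))
loss.backward()
print(float(loss))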
2018
Kutuzov, A; Øvrelid, Lilja; Szymanski, T; Velldal, Erik
Diachronic word embeddings and semantic shifts: a survey Proceeding
2018, (Pre SFI).
@proceedings{Kutuzov2018,
title = {Diachronic word embeddings and semantic shifts: a survey},
author = {A Kutuzov and Lilja Øvrelid and T Szymanski and Erik Velldal},
url = {https://www.aclweb.org/anthology/C18-1117/},
year = {2018},
date = {2018-08-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Velldal, Erik; Øvrelid, Lilja; Bergem, Eivind Alexander; Stadsnes, Cathrine; Touileb, Samia; Jørgensen, Fredrik
NoReC: The Norwegian Review Corpus Proceeding
2018, (Pre SFI).
@proceedings{Velldal2018,
title = {NoReC: The Norwegian Review Corpus},
author = {Erik Velldal and Lilja Øvrelid and Eivind Alexander Bergem and Cathrine Stadsnes and Samia Touileb and Fredrik Jørgensen},
year = {2018},
date = {2018-05-12},
url = {https://repo.clarino.uib.no/xmlui/handle/11509/124},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
2017
Touileb, Samia; Pedersen, Truls; Sjøvaag, Helle
Automatic identification of unknown names with specific roles Journal Article
In: Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, pp. 150-158, 2017, (Pre SFI).
@article{Touileb2017,
title = {Automatic identification of unknown names with specific roles},
author = {Samia Touileb and Truls Pedersen and Helle Sjøvaag},
url = {https://www.aclweb.org/anthology/W18-4517.pdf},
year = {2017},
date = {2017-08-01},
journal = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature},
pages = {150-158},
abstract = {Automatically identifying persons in a particular role within a large corpus can be a difficult task, especially if you don’t know who you are actually looking for. Resources compiling names of persons can be available, but no exhaustive lists exist. However, such lists usually contain known names that are “visible” in the national public sphere, and tend to ignore the marginal and international ones. In this article we propose a method for automatically generating suggestions of names found in a corpus of Norwegian news articles, and which “naturally” belong to a given initial list of members, and that were not known (compiled in a list) beforehand. The approach is based, in part, on the assumption that surface level syntactic features reveal parts of the underlying semantic content and can help uncover the structure of the language.},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
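A very rough, hypothetical illustration of the bootstrapping idea in the abstract above: surface contexts are harvested around known role holders and then reused to propose previously unseen names in the same role. The corpus, seed list, and single-word left-context heuristic below are invented for the example and are far simpler than the paper's surface-pattern approach.

import re

# Toy corpus and seed list (invented; not the paper's data).
corpus = [
    "Redaktør Kari Nordmann uttalte seg om saken.",
    "Redaktør Ola Hansen kommenterte rapporten.",
]
seed_names = {"Kari Nordmann"}  # names already known to hold the role

# Harvest surface contexts: the word immediately preceding a seed name.
contexts = set()
for sentence in corpus:
    for name in seed_names:
        for match in re.finditer(re.escape(name), sentence):
            preceding = sentence[:match.start()].split()
            if preceding:
                contexts.add(preceding[-1])

# Reuse the contexts to propose new capitalised two-word names in the same position.
candidates = set()
for sentence in corpus:
    for context in contexts:
        pattern = re.escape(context) + r"\s+([A-ZÆØÅ]\w+\s+[A-ZÆØÅ]\w+)"
        for match in re.finditer(pattern, sentence):
            if match.group(1) not in seed_names:
                candidates.add(match.group(1))

print(candidates)  # expected to suggest 'Ola Hansen'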
Fares, M; Kutuzov, A; Oepen, S; Velldal, Erik
Word vectors, reuse, and replicability: Towards a community repository of large-text resources Proceeding
2017, (Pre SFI).
@proceedings{Fares2017,
title = {Word vectors, reuse, and replicability: Towards a community repository of large-text resources},
author = {M Fares and A Kutuzov and S Oepen and Erik Velldal},
url = {https://www.duo.uio.no/handle/10852/65205},
year = {2017},
date = {2017-05-22},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
2016
Rosén, V; Thunes, M; Haugereid, P; Losnegaard, GS; Dyvik, H; Meurer, P; Lyse, G; Smedt, Koenraad De
The enrichment of lexical resources through incremental parsebanking Journal Article
In: 2016, (Pre SFI).
@article{Rosén2016,
title = {The enrichment of lexical resources through incremental parsebanking},
author = {V Rosén and M Thunes and P Haugereid and GS Losnegaard and H Dyvik and P Meurer and G Lyse and Koenraad De Smedt},
url = {https://bora.uib.no/bora-xmlui/handle/1956/15680},
year = {2016},
date = {2016-06-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Dyvik, H; Meurer, P; Rosén, V; Smedt, Koenraad De; Haugereid, P; Losnegaard, GS; Lyse, G; Thunes, M
NorGramBank: A 'Deep' Treebank for Norwegian. Proceedings of LREC Proceeding
2016, (Pre SFI).
@proceedings{Dyvik2016,
title = {NorGramBank: A 'Deep' Treebank for Norwegian. Proceedings of LREC},
author = {H Dyvik and P Meurer and V Rosén and Koenraad De Smedt and P Haugereid and GS Losnegaard and G Lyse and M Thunes},
url = {https://www.aclweb.org/anthology/L16-1565.pdf},
year = {2016},
date = {2016-05-16},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Øvrelid, Lilja; Hohle, P
Universal dependencies for Norwegian Proceeding
2016, (Pre SFI).
@proceedings{Øvrelid2016,
title = {Universal dependencies for Norwegian},
author = {Lilja Øvrelid and P Hohle},
url = {https://www.aclweb.org/anthology/L16-1250/},
year = {2016},
date = {2016-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Rosén, V; Smedt, Koenraad De; Losnegaard, GS; Bejcek, E; Savary, A; Osenova, P
MWEs in Treebanks: From Survey to Guidelines Proceeding
2016, (Pre SFI).
@proceedings{Rosén2016b,
title = {MWEs in Treebanks: From Survey to Guidelines},
author = {V Rosén and Koenraad De Smedt and GS Losnegaard and E Bejcek and A Savary and P Osenova},
url = {https://www.aclweb.org/anthology/L16-1368.pdf},
year = {2016},
date = {2016-05-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
2012
Lapponi, E; Read, J; Øvrelid, Lilja
Representing and resolving negation for sentiment analysis Proceeding
2012, (Pre SFI).
@proceedings{Lapponi2012,
title = {Representing and resolving negation for sentiment analysis},
author = {E Lapponi and J Read and Lilja Øvrelid},
url = {https://ieeexplore.ieee.org/document/6406506},
year = {2012},
date = {2012-12-10},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
Velldal, Erik; Øvrelid, Lilja; Read, J; Oepen, S
Speculation and negation: Rules, rankers, and the role of syntax Journal Article
In: 2012, (Pre SFI).
@article{Velldal2012,
title = {Speculation and negation: Rules, rankers, and the role of syntax},
author = {Erik Velldal and Lilja Øvrelid and J Read and S Oepen},
url = {https://www.mitpressjournals.org/doi/pdf/10.1162/COLI_a_00126},
year = {2012},
date = {2012-01-01},
note = {Pre SFI},
keywords = {},
pubstate = {published},
tppubtype = {article}
}