# The collection of aggregators, i.e., functions that take in a data frame # which has columns Item, Annotator, and Category, and return a named vector # representing the collective annotation for each item source("aux.R") # Simple Plurality Rule (SPR) SPR <- function(dframe){ # Compute the result of the simple plurality rule (NAs for ties) # # Args: # dframe: a data frame containing columns Item, Annotator, and Category # # Result: # a named vector each element of which is the most chosen category for the # corresponding item. If there is a tie, it will be an NA # Error handling CheckValidity(dframe, c("Item", "Annotator", "Category")) return(tapply(dframe$Category, dframe$Item, MaxOccurrence)) } # Weighted Plurality Rule (WPR) WPR <- function(dframe, wt, bias=function(cat)0){ # Compute the result of the weighted plurality rule (NAs for ties) # # Args: # dframe: a data frame containing columns Item, Annotator, and Category # wt: a weighting function that takes in Item, Annotator and Category and # return the corresponding weight # # Result: # a named vector each element of which is the category with the greatest # weight for the corresponding item. If there is a tie, it will be an NA # Error handling CheckValidity(dframe, c("Item", "Annotator", "Category")) sapply(split(dframe, dframe$Item), function(d)WeightedWinner(d, wt, bias)) } COM <- function(dframe, K=length(unique(dframe$Category))){ # Compute the result of the COM rule # # Args: # dframe: a data frame containing columns Item, Annotator, and Category # K: the total number of categories # # Result: # a named vector each element of which is the category with the greatest # weight for the corresponding item. If there is a tie, it will be an NA tab.com <- IndFreq(dframe) wt.com <- function(item, annotator, category){ relfreq <- tab.com[[as.character(annotator)]][as.character(category)] if (is.null(relfreq) || is.na(relfreq) || length(relfreq) != 1){ stop("Unknown annotator or category, or wrong data type!") } return(1 + 1 / K - relfreq) } return(WPR(dframe, wt.com)) } INV <- function(dframe){ # Compute the result of the INV rule # # Args: # dframe: a data frame containing columns Item, Annotator, and Category # # Result: # a named vector each element of which is the category with the greatest # weight for the corresponding item. If there is a tie, it will be an NA tab.inv <- IndFreq(dframe) wt.inv <- function(item, annotator, category){ relfreq <- tab.inv[[as.character(annotator)]][as.character(category)] if (is.null(relfreq) || is.na(relfreq) || length(relfreq) != 1){ stop("Unknown annotator or category, or wrong data type!") } return(1 / relfreq) } return(WPR(dframe, wt.inv)) } DIFF <- function(dframe){ # Compute the result of the DIFF rule # # Args: # dframe: a data frame containing columns Item, Annotator, and Category # # Result: # a named vector each element of which is the category with the greatest # weight for the corresponding item. If there is a tie, it will be an NA tab.ind <- IndFreq(dframe) tab.global <- table(dframe$Category) / length(dframe$Category) wt.diff <- function(item, annotator, category){ relfreq <- tab.ind[[as.character(annotator)]][as.character(category)] if (is.null(relfreq) || is.na(relfreq) || length(relfreq) != 1){ stop("Unknown annotator or category, or wrong data type!") } return(1 + tab.global[as.character(category)] - relfreq) } return(WPR(dframe, wt.diff)) } RAT <- function(dframe){ # Compute the result of the RAT rule # # Args: # dframe: a data frame containing columns Item, Annotator, and Category # # Result: # a named vector each element of which is the category with the greatest # weight for the corresponding item. If there is a tie, it will be an NA tab.ind <- IndFreq(dframe) tab.global <- table(dframe$Category) / length(dframe$Category) wt.rat <- function(item, annotator, category){ relfreq <- tab.ind[[as.character(annotator)]][as.character(category)] if (is.null(relfreq) || is.na(relfreq) || length(relfreq) != 1){ stop("Unknown annotator or category, or wrong data type!") } return(tab.global[as.character(category)] / relfreq) } return(WPR(dframe, wt.rat)) } # agreement-based weights WeightsViaAgreement <- function(dframe, ann.base, K=length(unique(dframe$Category))){ # Compute the weights based on agreement # Args: # dframe: a data frame containing columns Item, Annotator, and Category # ann.base: a named vector representing an annotation based on which the # weights are derived # K: the total number of categories # # Result: # the weighting function derived from agreement with ann.base, # with Laplace smoothing (0.5) applied # v <- dframe$Category names(v) <- dframe$Item ann.dict <- tapply(v, dframe$Annotator, function(ann){(sum(AgreementVector(ann, ann.base), na.rm=TRUE) + 0.5) / (length(ann) + 1)}) wt <- function(item, annotator, category){ acc <- ann.dict[as.character(annotator)] return(log((K-1) * acc / (1 - acc))) } return(wt) } AGR <- function(dframe, K=length(unique(dframe$Category))){ # Compute the result of the AGR rule # # Args: # dframe: a data frame containing columns Item, Annotator, and Category # K: the total number of categories # # Result: # a named vector each element of which is the category with the greatest # weight for the corresponding item. If there is a tie, it will be an NA # return(WPR(dframe, WeightsViaAgreement(dframe, SPR(dframe), K))) } ORA <- function(dframe, K=length(unique(dframe$Category))){ # Compute the result of the ORA rule # # Args: # dframe: a data frame containing columns Item, Annotator, Category, # and Gold # K: the total number of categories # # Result: # a named vector each element of which is the category with the greatest # weight for the corresponding item. If there is a tie, it will be an NA # return(WPR(dframe, WeightsViaAgreement(dframe, ReadGold(dframe), K))) } AGR.PRIOR <- function(dframe, K=length(unique(dframe$Category))){ # Compute the result of the AGR rule, using estimated category priors as well # # Args: # dframe: a data frame containing columns Item, Annotator, and Category # K: the total number of categories # # Result: # a named vector each element of which is the category with the greatest # weight for the corresponding item. If there is a tie, it will be an NA # tab.global <- table(dframe$Category) / length(dframe$Category) return(WPR(dframe, WeightsViaAgreement(dframe, SPR(dframe), K), bias=function(cat)log(tab.global[as.character(cat)]))) } AGR.ITER <- function(dframe, K=length(unique(dframe$Category)), iter.max=50){ # Compute the result of the iterated AGR rule # # Args: # dframe: a data frame containing columns Item, Annotator, and Category # K: the total number of categories # iter.max: the maximal number of iterations # # Result: # a named vector each element of which is the category with the greatest # weight for the corresponding item. If there is a tie, it will be an NA # If the iteration converges, the function will print out the number of # steps taken, otherwise there will be a warning ann.current <- WPR(dframe, WeightsViaAgreement(dframe, SPR(dframe), K)) for (i in 1:iter.max){ ann.next <- WPR(dframe, WeightsViaAgreement(dframe, ann.current, K)) same <- all(as.logical(AgreementVector(ann.next, ann.current))) if (!is.na(same) && same){ print(paste("Converged after", i, "iterations")) return(ann.current) } ann.current <- ann.next } warning(paste("Not converged after", iter.max, "iterations")) return(ann.current) }