# The collection of aggregators, i.e., functions that take in a data frame 
# which has columns Item, Annotator, and Category, and return a named vector
# representing the collective annotation for each item

source("aux.R")

# Simple Plurality Rule (SPR)
SPR <- function(dframe){
  # Compute the result of the simple plurality rule (NAs for ties)
  #
  # Args: 
  #   dframe: a data frame containing columns Item, Annotator, and Category
  #
  # Result:
  #   a named vector each element of which is the most chosen category for the
  # corresponding item. If there is a tie, it will be an NA
  
  # Error handling
  CheckValidity(dframe, c("Item", "Annotator", "Category"))

  return(tapply(dframe$Category, dframe$Item, MaxOccurrence))
}

# Weighted Plurality Rule (WPR)
WPR <- function(dframe, wt, bias=function(cat)0){
  # Compute the result of the weighted plurality rule (NAs for ties)
  #
  # Args: 
  #   dframe: a data frame containing columns Item, Annotator, and Category
  #   wt: a weighting function that takes in Item, Annotator and Category and 
  #     return the corresponding weight
  #
  # Result:
  #   a named vector each element of which is the category with the greatest 
  #   weight for the corresponding item. If there is a tie, it will be an NA
  
  # Error handling
  CheckValidity(dframe, c("Item", "Annotator", "Category"))
  
  sapply(split(dframe, dframe$Item), function(d)WeightedWinner(d, wt, bias))
}

COM <- function(dframe, K=length(unique(dframe$Category))){
  # Compute the result of the COM rule
  #
  # Args: 
  #   dframe: a data frame containing columns Item, Annotator, and Category
  #   K: the total number of categories
  #
  # Result:
  #   a named vector each element of which is the category with the greatest 
  #   weight for the corresponding item. If there is a tie, it will be an NA
  
  tab.com <- IndFreq(dframe)
  
  wt.com <- function(item, annotator, category){
    relfreq <- tab.com[[as.character(annotator)]][as.character(category)]
    if (is.null(relfreq) || is.na(relfreq) || length(relfreq) != 1){
      stop("Unknown annotator or category, or wrong data type!")
    }
    return(1 + 1 / K - relfreq)
  }
  
  return(WPR(dframe, wt.com))
  
}

INV <- function(dframe){
  # Compute the result of the INV rule
  #
  # Args: 
  #   dframe: a data frame containing columns Item, Annotator, and Category
  #
  # Result:
  #   a named vector each element of which is the category with the greatest 
  #   weight for the corresponding item. If there is a tie, it will be an NA
  
  tab.inv <- IndFreq(dframe)
  
  wt.inv <- function(item, annotator, category){
    relfreq <- tab.inv[[as.character(annotator)]][as.character(category)]
    if (is.null(relfreq) || is.na(relfreq) || length(relfreq) != 1){
      stop("Unknown annotator or category, or wrong data type!")
    }
    return(1 / relfreq)
  }
  
  return(WPR(dframe, wt.inv))
  
}

DIFF <- function(dframe){
  # Compute the result of the DIFF rule
  #
  # Args: 
  #   dframe: a data frame containing columns Item, Annotator, and Category
  #
  # Result:
  #   a named vector each element of which is the category with the greatest 
  #   weight for the corresponding item. If there is a tie, it will be an NA
  
  tab.ind <- IndFreq(dframe)
  tab.global <- table(dframe$Category) / length(dframe$Category)
  
  wt.diff <- function(item, annotator, category){
    relfreq <- tab.ind[[as.character(annotator)]][as.character(category)]
    if (is.null(relfreq) || is.na(relfreq) || length(relfreq) != 1){
      stop("Unknown annotator or category, or wrong data type!")
    }
    return(1 + tab.global[as.character(category)] - relfreq)
  }
  
  return(WPR(dframe, wt.diff))
  
}

RAT <- function(dframe){
  # Compute the result of the RAT rule
  #
  # Args: 
  #   dframe: a data frame containing columns Item, Annotator, and Category
  #
  # Result:
  #   a named vector each element of which is the category with the greatest 
  #   weight for the corresponding item. If there is a tie, it will be an NA
  
  tab.ind <- IndFreq(dframe)
  tab.global <- table(dframe$Category) / length(dframe$Category)
  
  wt.rat <- function(item, annotator, category){
    relfreq <- tab.ind[[as.character(annotator)]][as.character(category)]
    if (is.null(relfreq) || is.na(relfreq) || length(relfreq) != 1){
      stop("Unknown annotator or category, or wrong data type!")
    }
    return(tab.global[as.character(category)] / relfreq)
  }
  
  return(WPR(dframe, wt.rat))
  
}

# agreement-based weights
WeightsViaAgreement <- function(dframe, ann.base, 
                                K=length(unique(dframe$Category))){
  # Compute the weights based on agreement
  
  # Args:
  #   dframe: a data frame containing columns Item, Annotator, and Category
  #   ann.base: a named vector representing an annotation based on which the 
  #             weights are derived
  #   K: the total number of categories
  #
  # Result: 
  #   the weighting function derived from agreement with ann.base, 
  #   with Laplace smoothing (0.5) applied
  #
  
  v <- dframe$Category
  names(v) <- dframe$Item
  
  ann.dict <- tapply(v, dframe$Annotator, 
                     function(ann){(sum(AgreementVector(ann, ann.base), 
                                        na.rm=TRUE) + 0.5) / (length(ann) + 1)})
  
  wt <- function(item, annotator, category){
    acc <- ann.dict[as.character(annotator)]
    return(log((K-1) * acc / (1 - acc)))
  }
  return(wt)
}


AGR <- function(dframe, K=length(unique(dframe$Category))){
  # Compute the result of the AGR rule
  #
  # Args: 
  #   dframe: a data frame containing columns Item, Annotator, and Category
  #   K: the total number of categories
  #
  # Result:
  #   a named vector each element of which is the category with the greatest 
  #   weight for the corresponding item. If there is a tie, it will be an NA
  #
    
    return(WPR(dframe, 
               WeightsViaAgreement(dframe, SPR(dframe), K)))
}

ORA <- function(dframe, K=length(unique(dframe$Category))){
  # Compute the result of the ORA rule
  #
  # Args: 
  #   dframe: a data frame containing columns Item, Annotator, Category, 
  #           and Gold
  #   K: the total number of categories
  #
  # Result:
  #   a named vector each element of which is the category with the greatest 
  #   weight for the corresponding item. If there is a tie, it will be an NA
  #
  
  return(WPR(dframe, 
             WeightsViaAgreement(dframe, ReadGold(dframe), K)))
}

AGR.PRIOR <- function(dframe, K=length(unique(dframe$Category))){
  # Compute the result of the AGR rule, using estimated category priors as well 
  #
  # Args: 
  #   dframe: a data frame containing columns Item, Annotator, and Category
  #   K: the total number of categories
  #
  # Result:
  #   a named vector each element of which is the category with the greatest 
  #   weight for the corresponding item. If there is a tie, it will be an NA
  #
  
  tab.global <- table(dframe$Category) / length(dframe$Category)
  
  return(WPR(dframe, 
             WeightsViaAgreement(dframe, SPR(dframe), K),
             bias=function(cat)log(tab.global[as.character(cat)])))
}

AGR.ITER <- function(dframe, K=length(unique(dframe$Category)), iter.max=50){
  # Compute the result of the iterated AGR rule
  #
  # Args: 
  #   dframe: a data frame containing columns Item, Annotator, and Category
  #   K: the total number of categories
  #   iter.max: the maximal number of iterations
  #
  # Result:
  #   a named vector each element of which is the category with the greatest 
  #   weight for the corresponding item. If there is a tie, it will be an NA
  #   If the iteration converges, the function will print out the number of 
  #   steps taken, otherwise there will be a warning
  
  ann.current <- WPR(dframe, WeightsViaAgreement(dframe, SPR(dframe), K))
  
  for (i in 1:iter.max){
    ann.next <- WPR(dframe, WeightsViaAgreement(dframe, ann.current, K))
    same <- all(as.logical(AgreementVector(ann.next, ann.current)))
    if (!is.na(same) && same){
      print(paste("Converged after", i, "iterations"))
      return(ann.current)
    }
    ann.current <- ann.next
  }
  
  warning(paste("Not converged after", iter.max, "iterations"))
  return(ann.current)
}