#' Identify Key Routes in Citation Networks
#'
#' This function identifies and visualizes key citation routes within scientific
#' networks by analyzing the most significant citation paths between publications.
#' The algorithm implements the key-route search from the integrated main path
#' analysis approach described in Liu & Lu (2012).
#'
#' @param network A network object of class `tbl_graph` or `igraph` containing
#'   citation data, or a list object generated by `sniff_groups()` when
#'   `scope = "groups"`
#' @param scope Character string specifying the analysis scope. Must be either
#'   "network" (for full network analysis) or "groups" (for group-wise analysis
#'   of a grouped network)
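#' @param citations_percentage Numeric value greater than 0 and at most 1
#'   giving the share of edges, taken from the highest SPC downwards, that are
#'   eligible for the key-route path (the cutoff is the
#'   \code{1 - citations_percentage} quantile of the SPC values).
#'   Default is 1 (all edges)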
#'
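#' @return A named list with one element per group. Groups that have no edges
#'   are skipped with a warning and their element is left \code{NULL}. Every
#'   other element is a list with:
#' \itemize{
#'   \item \code{plot} - A ggplot2 object visualizing the key citation route
#'   \item \code{data} - A tibble with the columns \code{name}, \code{name2}
#'         (an author_year label used in the plot) and \code{TI} for the
#'         nodes in the key route
#' }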
#'
#' @details
#' The function implements the key-route search from Liu & Lu (2012):
#' \enumerate{
#'   \item Computes Search Path Count (SPC) for each citation link using an
#'         efficient O(V+E) algorithm based on topological sort. SPC measures
#'         how many source-to-sink paths traverse each link.
#'   \item Selects the key-route: the link with the highest SPC value.
#'   \item Searches forward from the end node of the key-route, greedily
#'         following the outgoing link with the highest SPC, until a sink is reached.
#'   \item Searches backward from the start node of the key-route, greedily
#'         following the incoming link with the highest SPC, until a source is reached.
#' }
#'
#' The SPC is computed as \code{forward[u] * backward[v]} for each edge (u, v),
#' where \code{forward[u]} counts paths from any source to u and \code{backward[v]}
#' counts paths from v to any sink (Batagelj, 2003). Because the search starts
#' from the link with the highest SPC and extends it in both directions, the
#' most significant link is always included in the key-route path.
#'
#' @references
#' Liu JS, Lu LYY. An integrated approach for main path analysis: Development
#' of the Hirsch index as an example. Journal of the American Society for
#' Information Science and Technology. 2012;63(3):528-542.
#' \doi{10.1002/asi.21692}
#'
#' Batagelj V. Efficient algorithms for citation network analysis.
#' University of Ljubljana, Institute of Mathematics, Physics and Mechanics,
#' Department of Theoretical Computer Science, Preprint Series. 2003;41:897.
#'
#' @examples
#' \dontrun{
#' # Example with network scope
#' result <- sniff_key_route(my_network, scope = "network", citations_percentage = 0.8)
#'
#' # Example with groups scope
#' grouped_network <- sniff_groups(data)
#' result <- sniff_key_route(grouped_network, scope = "groups")
#'
#' # Access results for a specific group
#' result$group_name$plot
#' result$group_name$data
#' }
#'
#' @export
#' @importFrom igraph V E ends degree vcount ecount incident is_dag is_directed
#' @importFrom igraph topo_sort graph_from_edgelist feedback_arc_set delete_edges
#' @importFrom tidygraph activate
#' @importFrom dplyr filter mutate pull select
#' @importFrom tibble as_tibble
#' @importFrom stats na.omit quantile
#' @importFrom glue glue
sniff_key_route <- function(network, scope = "network", citations_percentage = 1) {
  # --- Input validation ---
  if (is.null(network)) {
    stop("Network data not found in groups object", call. = FALSE)
  }

  required_scope <- c("network", "groups")
  # Guard against non-character / non-scalar values so the condition passed to
  # `if` is always a single TRUE/FALSE.
  if (!is.character(scope) || length(scope) != 1 || !scope %in% required_scope) {
    stop(glue::glue("scope must be: {paste(required_scope, collapse = ' or ')}"), call. = FALSE)
  }

  if (scope == "groups") {
    list_dimensions <- c("network", "pubs_by_year", "aggregate")
    if (!all(list_dimensions %in% names(network))) {
      stop(glue::glue("network file must be generated by sniff_groups()"), call. = FALSE)
    }
    net_data <- network$network
    if (!inherits(net_data, c("tbl_graph", "igraph"))) {
      stop("network file must be generated by sniff_groups()", call. = FALSE)
    }
    # Normalise to tbl_graph so the tidygraph verbs below always apply.
    net_data <- tidygraph::as_tbl_graph(net_data)
  } else {
    if (!inherits(network, c("tbl_graph", "igraph"))) {
      stop("Input (network) must be a network object (tbl_graph or igraph)", call. = FALSE)
    }

    # Plain igraph objects are accepted, so convert before using tidygraph
    # verbs; the whole network is then treated as one pseudo-group.
    net_data <- network |>
      tidygraph::as_tbl_graph() |>
      tidygraph::activate(nodes) |>
      dplyr::mutate(group = "full_network")
  }

  # Verify this is a directed (citation) network. The NT node attribute is
  # only checked when it is present and not missing.
  network_type <- igraph::V(net_data)$NT
  if (!is.null(network_type) && length(network_type) > 0 &&
    !is.na(network_type[[1]]) && network_type[[1]] != "direct-citation") {
    stop("Input (network) must be a direct citation network (type = 'direct citation')", call. = FALSE)
  }
  if (!igraph::is_directed(net_data)) {
    stop("Input (network) must be a directed network. Key-route analysis requires a direct citation network.", call. = FALSE)
  }

  # Vertex names are used to build the route graph and to join publication
  # data, so fail early with a clear message when they are absent.
  if (is.null(igraph::V(net_data)$name)) {
    stop("Input (network) nodes must have a 'name' attribute", call. = FALSE)
  }

  if (!is.numeric(citations_percentage) || length(citations_percentage) != 1 ||
    is.na(citations_percentage) || citations_percentage <= 0 || citations_percentage > 1) {
    stop("citations_percentage must be a number between 0 (exclusive) and 1 (inclusive)", call. = FALSE)
  }

  # Unique, sorted, non-missing group labels found on the nodes
  group_levels <- tryCatch(
    {
      net_data |>
        tidygraph::activate(nodes) |>
        tibble::as_tibble() |>
        dplyr::pull("group") |>
        stats::na.omit() |>
        unique() |>
        sort()
    },
    error = function(e) {
      stop("Error extracting groups from network: ", e$message, call. = FALSE)
    }
  )

  if (length(group_levels) == 0) {
    stop("No valid groups found for analysis", call. = FALSE)
  }

  # One slot per group; slots of skipped groups stay NULL.
  res <- vector(mode = "list", length = length(group_levels))
  names(res) <- group_levels

  for (grp in seq_along(group_levels)) {
    current_group <- group_levels[[grp]]
    message("Processing key-route for group: ", current_group)

    # `!!` injects the local value. A bare `group[[grp]]` here would be
    # data-masked by the node column `group` and pick the grp-th node's group
    # instead of the grp-th unique group.
    net_data_group <- net_data |>
      tidygraph::activate(nodes) |>
      dplyr::filter(.data$group == !!current_group)

    n_edges <- igraph::ecount(net_data_group)

    if (n_edges == 0) {
      warning("No edges found for group: ", current_group, " - skipping", call. = FALSE)
      next
    }

    # Ensure DAG: remove a feedback arc set if the group contains cycles
    if (!igraph::is_dag(net_data_group)) {
      cycle_edges <- igraph::feedback_arc_set(net_data_group)
      n_removed <- length(cycle_edges)
      message("Removed ", n_removed, " cycle-causing edge(s) to create DAG for group: ", current_group)
      net_data_group <- igraph::delete_edges(net_data_group, cycle_edges) |>
        tidygraph::as_tbl_graph()
      n_edges <- igraph::ecount(net_data_group)
      if (n_edges == 0) {
        warning("No edges remaining after cycle removal for group: ", current_group, " - skipping", call. = FALSE)
        next
      }
    }

    # --- SPC computation (Batagelj, 2003) ---
    spc_info <- .key_route_spc(net_data_group)
    from_idx <- spc_info$from
    to_idx <- spc_info$to
    spc <- spc_info$spc

    # --- Key-route search (Liu & Lu, 2012) ---

    # Edges at or above the (1 - citations_percentage) SPC quantile are eligible
    if (citations_percentage < 1) {
      spc_cutoff <- stats::quantile(spc, probs = 1 - citations_percentage, names = FALSE)
      eligible <- spc >= spc_cutoff
    } else {
      eligible <- rep(TRUE, n_edges)
    }

    # Step 1: the key-route is the eligible edge with the highest SPC
    key_edge <- which.max(ifelse(eligible, spc, -1))
    visited_nodes <- c(from_idx[key_edge], to_idx[key_edge])

    # Step 2: extend forward from the head of the key-route
    forward_edges <- .key_route_walk(
      net_data_group, to_idx[key_edge], "out", spc, eligible, to_idx, visited_nodes
    )
    visited_nodes <- c(visited_nodes, to_idx[forward_edges])

    # Step 3: extend backward from the tail of the key-route
    backward_edges <- .key_route_walk(
      net_data_group, from_idx[key_edge], "in", spc, eligible, from_idx, visited_nodes
    )

    # Backward edges were collected walking away from the key-route, so they
    # are reversed to list the whole route from its first edge to its last.
    key_route_edges <- c(rev(backward_edges), key_edge, forward_edges)

    # --- Build result graph and visualization ---

    vertex_names <- igraph::V(net_data_group)$name
    route_el <- cbind(
      vertex_names[from_idx[key_route_edges]],
      vertex_names[to_idx[key_route_edges]]
    )

    node_graph <- igraph::graph_from_edgelist(route_el, directed = TRUE)

    # Edge order is preserved by graph_from_edgelist(), so SPC values align
    igraph::E(node_graph)$spc <- spc[key_route_edges]
    route_names <- igraph::V(node_graph)$name

    # The prefix of the DB node attribute (text before the first "_") selects
    # how the author label is parsed; a missing attribute uses the default rule.
    db_attr <- igraph::V(net_data_group)$DB
    data_source <- if (length(db_attr) > 0 && !is.na(db_attr[[1]])) {
      gsub("_.*$", "", db_attr[[1]])
    } else {
      ""
    }

    first_author <- if (identical(data_source, "openalex")) {
      # Text before the first "|", last word
      function(au) stringr::word(gsub("\\|.*", "", au), -1)
    } else {
      # Text before the first ",", first space replaced by "-", first word
      function(au) stringr::word(sub(" ", "-", gsub("\\,.*", "", au)), 1)
    }

    data_path <- net_data_group |>
      tidygraph::activate(nodes) |>
      dplyr::filter(.data$name %in% !!route_names) |>
      tibble::as_tibble() |>
      dplyr::mutate(name2 = paste(first_author(.data$AU), .data$PY, sep = "_")) |>
      dplyr::select("name", "name2", "TI")

    krp <- node_graph |>
      tidygraph::as_tbl_graph() |>
      dplyr::left_join(data_path, by = "name") |>
      ggraph::ggraph(layout = "tree") +
      ggraph::geom_edge_link(color = "gray50", width = 1) +
      ggraph::geom_node_point(color = "steelblue", size = 4) +
      ggrepel::geom_text_repel(
        ggplot2::aes(x = .data$x, y = .data$y, label = .data$name2),
        size = 4,
        min.segment.length = 0,
        box.padding = 0.5,
        max.overlaps = Inf
      ) +
      ggplot2::scale_y_reverse() +
      ggraph::theme_graph() +
      ggplot2::theme(plot.margin = ggplot2::unit(c(1, 1, 2, 1), "cm"))

    res[[grp]] <- list(plot = krp, data = data_path)
  }

  res
}

# Compute the Search Path Count (SPC) of every edge of a directed acyclic graph.
#
# `graph` must be a DAG with at least one edge. Returns a list with `from` and
# `to` (tail and head vertex ids of each edge, in edge-id order) and `spc`
# (forward[from] * backward[to] for each edge).
.key_route_spc <- function(graph) {
  n_nodes <- igraph::vcount(graph)
  el <- igraph::ends(graph, igraph::E(graph), names = FALSE)
  from_idx <- el[, 1]
  to_idx <- el[, 2]

  topo <- as.integer(igraph::topo_sort(graph, mode = "out"))

  # forward[v]: number of paths from any source to v. Sources start at 1 and
  # vertices are visited in topological order, so every predecessor is final
  # before it is summed.
  forward_count <- numeric(n_nodes)
  forward_count[igraph::degree(graph, mode = "in") == 0] <- 1
  for (v in topo) {
    in_edges <- as.integer(igraph::incident(graph, v, mode = "in"))
    if (length(in_edges) > 0) {
      forward_count[v] <- sum(forward_count[from_idx[in_edges]])
    }
  }

  # backward[v]: number of paths from v to any sink, filled in reverse
  # topological order for the same reason.
  backward_count <- numeric(n_nodes)
  backward_count[igraph::degree(graph, mode = "out") == 0] <- 1
  for (v in rev(topo)) {
    out_edges <- as.integer(igraph::incident(graph, v, mode = "out"))
    if (length(out_edges) > 0) {
      backward_count[v] <- sum(backward_count[to_idx[out_edges]])
    }
  }

  list(
    from = from_idx,
    to = to_idx,
    spc = forward_count[from_idx] * backward_count[to_idx]
  )
}

# Greedily extend a route from vertex `start`.
#
# At each step the eligible edge incident to the current vertex (in direction
# `mode`, "out" or "in") with the highest SPC is taken; `next_idx` maps an edge
# id to the vertex reached through it. The walk stops when no eligible edge is
# left or the next vertex is already in `visited`. Returns the edge ids in the
# order they were traversed (possibly empty).
.key_route_walk <- function(graph, start, mode, spc, eligible, next_idx, visited) {
  path <- integer(0)
  current_node <- start

  repeat {
    candidates <- as.integer(igraph::incident(graph, current_node, mode = mode))
    candidates <- candidates[eligible[candidates]]
    if (length(candidates) == 0) break

    best_edge <- candidates[which.max(spc[candidates])]
    next_node <- next_idx[best_edge]
    if (next_node %in% visited) break

    path <- c(path, best_edge)
    visited <- c(visited, next_node)
    current_node <- next_node
  }

  path
}
