#' LinkExtractor
#'
#' Takes a character url as input, fetches its html document, and extracts all
#' links following a set of rules.
#'
#' @param url character, url to fetch and extract links from.
#' @param id numeric, an id to identify a specific web page in a website collection; when omitted a random value between 1 and 1000 is drawn.
#' @param lev numeric, the depth level of the web page, auto-generated by the \code{Rcrawler} function; when omitted a random value between 1 and 1000 is drawn.
#' @param IndexErrPages numeric vector, additional http status codes to process. Status 200 is always processed; e.g. use c(404,403) to also process 404 and 403 pages.
#' @param Useragent character, the user-agent string sent with the request; defaults to "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:42.0) Firefox/42.0".
#' @param Timeout numeric, request timeout in seconds passed to \code{httr::timeout}; defaults to 6.
#' @param URLlenlimit integer, the url character length limit to index, defaults to 255 characters (to avoid spider traps).
#' @param urlExtfilter character vector, the list of file extensions to exclude from indexing. By default a large list is defined (only html pages are permitted) in order to prevent downloading large files; to define your own use c(ext1,ext2,ext3 ...).
#' @param ExternalLInks boolean, default FALSE; if set to TRUE external links are also returned.
#' @param urlbotfiler character vector, directories/files restricted by robots.txt. The entries are combined into a single regular expression and every link matching it is dropped; empty entries are ignored.
#' @param encod character, the encoding of the web page; defaults to "UTF-8", with a fallback to "ISO-8859-1" when decoding yields NA.
#' @param removeparams character vector, list of url parameters to be removed/ignored.
#' @return A list of three elements. The first is a list holding the web page
#'   details: id, url, fetch state ("finished", or "NULL" when the request
#'   failed), level, number of internal links, an empty placeholder, http status
#'   code, content type, content encoding and page content. When the request
#'   itself fails this list has only nine elements (no page content) and the
#'   fields after the level are empty strings. The second element is a character
#'   vector of retrieved internal urls and the third is a character vector of
#'   external urls scraped from the page (empty unless \code{ExternalLInks} is
#'   TRUE).
#' @author salim khalil
#' @import httr xml2 data.table
#' @examples
#'
#' pageinfo<-LinkExtractor(url="http://www.glofile.com", ExternalLInks = TRUE)
#' #Pageinfo handle page header detail, as well as content, and internal links.
#' #pageinfo[[1]][[10]] : page content
#' #pageinfo[[2]] : Internal hyperlinks
#' #pageinfo[[3]] : External hyperlinks
#'
#' @export
LinkExtractor <- function(url, id, lev, IndexErrPages, Useragent, Timeout=6, URLlenlimit=255,
                          urlExtfilter, encod, urlbotfiler, removeparams, ExternalLInks=FALSE) {
  # Defaults for optional arguments that were not supplied.
  if (missing(removeparams)) removeparams <- ""
  if (missing(urlbotfiler)) urlbotfiler <- " "
  if (missing(id)) id <- sample(1:1000, 1)
  if (missing(lev)) lev <- sample(1:1000, 1)
  errstat <- if (missing(IndexErrPages)) c(200) else c(200, IndexErrPages)
  if (missing(Useragent)) {
    Useragent <- "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:42.0) Firefox/42.0"
  }
  if (missing(urlExtfilter)) {
    urlExtfilter <- c(
      "flv", "mov", "swf", "txt", "xml", "js", "css", "zip", "gz", "rar", "7z",
      "tgz", "tar", "z", "gzip", "bzip", "tar", "mp3", "mp4", "aac", "wav", "au",
      "wmv", "avi", "mpg", "mpeg", "pdf", "doc", "docx", "xls", "xlsx", "ppt",
      "pptx", "jpg", "jpeg", "png", "gif", "psd", "ico", "bmp", "odt", "ods",
      "odp", "odb", "odg", "odf"
    )
  }

  nblinks <- 0
  links2 <- character(0)
  Extlinks <- character(0)

  # Any request failure (unresolvable host, timeout, ...) yields NULL.
  page <- tryCatch(
    GET(url, user_agent(Useragent), timeout(Timeout)),
    error = function(e) NULL
  )
  if (is.null(page)) {
    # Kept as-is for existing callers: the failure record has nine fields.
    return(list(list(id, url, "NULL", lev, "", "", "", "", ""), links2, Extlinks))
  }

  ctype <- page$headers$`content-type`
  cont <- "NULL"

  # Only process accepted status codes whose content type mentions html.
  # isTRUE() guards against a response without a content-type header.
  if (page$status_code %in% errstat && isTRUE(grepl("html", ctype))) {
    enc <- if (missing(encod)) "UTF-8" else encod
    cont <- as.character(content(page, type = "htmlTreeParse", as = "text", encoding = enc))
    if (length(cont) == 1L && is.na(cont)) {
      # Decoding failed with the requested encoding: retry as ISO-8859-1.
      cont <- as.character(content(page, type = "htmlTreeParse", as = "text", encoding = "ISO-8859-1"))
    }
    if (length(cont) == 0L) cont <- ""  # empty response body

    # A body that cannot be parsed is recorded as a page without links
    # instead of aborting the caller.
    doc <- NULL
    if (length(cont) == 1L && !is.na(cont)) {
      doc <- tryCatch(read_html(cont), error = function(e) NULL)
    }

    if (!is.null(doc)) {
      links <- xml2::xml_find_all(doc, "//a/@href")
      links <- as.vector(paste(links))
      links <- gsub(" href=\"(.*)\"", "\\1", links)
      links <- unique(links)
      domain0 <- strsplit(gsub("http://|https://|www\\.", "", url), "/")[[c(1, 1)]]
      domain <- paste0(domain0, "/")
      # Link canonicalization
      links <- LinkNormalization(links, url)
      # Ignore url parameters
      links <- sapply(links, function(x) Linkparamsfilter(x, removeparams), USE.NAMES = FALSE)
      links <- unique(links)
      # robots.txt filter. Empty rules are skipped: an empty alternative in the
      # combined pattern would match (and therefore drop) every link.
      botrules <- urlbotfiler[nzchar(urlbotfiler)]
      if (length(botrules) > 0) {
        links <- links[!links %like% paste(botrules, collapse = "|")]
      }

      if (length(links) != 0) {
        # Drop missing and over-long urls first, so the extension is only
        # computed for links that pass the length limit.
        links <- links[!is.na(links)]
        links <- links[nchar(links) <= URLlenlimit]
        ext <- tools::file_ext(sub("\\?.+", "", basename(links)))
        allowed <- !(ext %in% urlExtfilter)
        # The domain is matched literally, not as a regular expression.
        internal <- grepl(domain, links, fixed = TRUE)
        links2 <- links[internal & allowed]
        # Number of internal links kept for this page.
        nblinks <- nblinks + length(links2)
        if (ExternalLInks) {
          Extlinks <- links[!internal & allowed]
        }
      }
    }
  }

  # Page detail record.
  contenttype <- tryCatch(gsub("(.*)\\;.*", "\\1", ctype), error = function(e) "NA")
  contentencod <- tryCatch(
    gsub("(.*)=(.*)", "\\2", gsub(".*\\;.", "\\1", ctype)),
    error = function(e) "NA"
  )
  pageinfo <- list(id, url, "finished", lev, nblinks, "", page$status_code,
                   contenttype, contentencod, cont)
  list(pageinfo, links2, Extlinks)
}

