/*
 *   Copyright (C) 1997, 1998, 1999 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 * Store and use URL selection patterns (from robots.txt files or
 * user specified). The crawler will make use of those functions to
 * find out if a URL is eligible for crawling or not. It is also
 * a convenience library of the robot exclusion protocol module.
 *
 */
/*
 * Match a URL against Allow and Disallow robots.txt clauses
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif /* HAVE_MALLOC_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_DMALLOC_H
#include <dmalloc.h>
#endif /* HAVE_DMALLOC_H */

#include <hash.h>
#include <split.h>
#include <urldirname.h>
#include <dirsel.h>
#include <uri.h>
#include <salloc.h>
#include <regex.h>                                                                                                                    
/*
 * Matching method selectors used by dirsel_match.
 *
 * DIRSEL_BOL: search for match at beginning of string (anchored).
 * dirsel_match selects this method when a pattern starts with '/',
 * then skips that leading '/' before comparing.
 */
#define DIRSEL_BOL	'/'
/*
 * DIRSEL_ANY: search for match anywhere in string. dirsel_match selects
 * this method for every pattern that does not start with '/'.
 */
#define DIRSEL_ANY	'A'

/*
 * List of strings (allowed or disallowed) attached to one hash table key.
 * The prefix variable is a list of string.
 * The prefix_length variable is the number of strings in prefix.
 *
 * dirsel_insert copies the caller's space separated list into pool,
 * runs split_inplace on it and copies the resulting pointer list
 * into prefix.
 */
typedef struct dirsel {
  /* Not referenced explicitly in this file; zeroed by dirsel_alloc */
  int info;

  /* List of strings to find in the URL */
  char** prefix;
  /* Length of the list */
  int prefix_length;

  /* Size bookkeeping handed to static_alloc together with prefix */
  int prefix_size;
  /* Private copy of the pattern list, split in place by dirsel_insert */
  char* pool;
  /* Size bookkeeping handed to static_alloc together with pool */
  int pool_size;
} dirsel_t;

// regexs management
//
// regex_types indexes the two arrays below (ALLOW is 0, DISALLOW is 1).
// regexs[type] is an array of regex_length[type] pointers to compiled
// expressions; a slot is NULL when the expression failed to compile.
// There is a single set per type: dirsel_build_regex overwrites it and
// dirsel_destroy_regex releases it.
enum   regex_types { ALLOW, DISALLOW };
static regex_t** regexs[2] = { NULL, NULL };
static int regex_length[2];
/*
 * Instance object. Holds mapping between server names
 * and struct dirsel entries listing the allowed/disallowed strings.
 * The url_object member is only here for performance reasons.
 *
 * All six tables are created by dirsel_init and released by dirsel_end.
 */
typedef struct context {
  /* User specified clauses, keyed by the netpath built by dirsel_netpath */
  hash_t* allow;
  hash_t* disallow;
  /* User specified regular expression clauses, keyed the same way */
  hash_t* regex_allow;
  hash_t* regex_disallow;
  /* robots.txt clauses, keyed by the netloc string given by the caller */
  hash_t* robots_allow;
  hash_t* robots_disallow;
  /* Scratch URL object reused for every parse; 0 means "not initialized" */
  uri_t* url_object;
} context_t;

/*
 * Trace level, accumulated by dirsel_verbose. Any non zero value
 * enables the diagnostic messages written to stderr.
 */
static int verbose = 0;

/*
 * The single module wide instance. As a static object it starts zeroed,
 * dirsel_init relies on url_object == 0 to detect that it must be set up.
 */
static context_t context;

/*
 * Forward declarations of the file local helpers defined below.
 */
static void dirsel_end_1(hash_t* table);
static void dirsel_insert(hash_t* table, char* url, char* dirs, int flag);
static int dirsel_allowed_1(char* comparable, dirsel_t* allow, dirsel_t* disallow);
static int dirsel_allowed_2(char* comparable, dirsel_t* regex_allow, dirsel_t* regex_disallow);
static char* dirsel_comparable(char* netpath, uri_t* url_object);
static int dirsel_match(dirsel_t* entry, char* comparable);
static int dirsel_regex_match(dirsel_t* entry, char* comparable, int regex_type);
static dirsel_t* dirsel_alloc();
static char* dirsel_netpath(uri_t* url_object);
static dirsel_t* entry_find(hash_t* table, char* key);

/*
 * Node destructor installed on every table by dirsel_init.
 * Releases the dirsel_t payload (string pool, prefix list, then the
 * structure itself), the key that dirsel_insert duplicated with strdup
 * and finally the node. The second argument is not used.
 */
static void hnode_free(hnode_t *node, void *)
{
  dirsel_t* payload = (dirsel_t*)node->data;

  free(payload->pool);
  free(payload->prefix);
  free(payload);

  free(node->key);
  free(node);
}

/*
 * Set up the module: one table for each of the robots.txt
 * Allow/Disallow clauses, the user provided Allow/Disallow clauses and
 * the user provided Regex Allow/Regex Disallow clauses, plus the shared
 * URL object. Does nothing when the URL object already exists.
 */
void dirsel_init()
{
  int capacity = HASHCOUNT_T_MAX;

  /* Already initialized */
  if(context.url_object != 0)
    return;

  context.allow = hash_create(capacity, 0, 0);
  context.disallow = hash_create(capacity, 0, 0);
  context.regex_allow = hash_create(capacity, 0, 0);
  context.regex_disallow = hash_create(capacity, 0, 0);
  context.robots_allow = hash_create(capacity, 0, 0);
  context.robots_disallow = hash_create(capacity, 0, 0);

  /* hnode_free releases key, entry and node when a table is freed */
  hash_set_allocator(context.allow, 0, hnode_free, 0);
  hash_set_allocator(context.disallow, 0, hnode_free, 0);
  hash_set_allocator(context.regex_allow, 0, hnode_free, 0);
  hash_set_allocator(context.regex_disallow, 0, hnode_free, 0);
  hash_set_allocator(context.robots_allow, 0, hnode_free, 0);
  hash_set_allocator(context.robots_disallow, 0, hnode_free, 0);

  context.url_object = uri_alloc_1();
}

/*
 * Release everything dirsel_init and the dirsel_regex_* functions
 * allocated: the six tables, the shared URL object and both sets of
 * compiled regular expressions.
 *
 * Safe to call more than once or before dirsel_init: each table pointer
 * is checked and reset to 0 after being freed, and the URL object is only
 * freed when it exists. dirsel_init may be called again afterwards.
 */
void dirsel_end()
{
  if(context.allow) {
    dirsel_end_1(context.allow);
    context.allow = 0;
  }
  if(context.disallow) {
    dirsel_end_1(context.disallow);
    context.disallow = 0;
  }
  if(context.robots_allow) {
    dirsel_end_1(context.robots_allow);
    context.robots_allow = 0;
  }
  if(context.robots_disallow) {
    dirsel_end_1(context.robots_disallow);
    context.robots_disallow = 0;
  }
  if(context.regex_allow) {
    dirsel_end_1(context.regex_allow);
    context.regex_allow = 0;
  }
  if(context.regex_disallow) {
    dirsel_end_1(context.regex_disallow);
    context.regex_disallow = 0;
  }

  if(context.url_object) {
    uri_free(context.url_object);
    /* dirsel_init tests this member to know it must run again */
    context.url_object = 0;
  }

  /* dirsel_destroy_regex copes by itself with an already empty set */
  dirsel_destroy_regex (ALLOW);
  dirsel_destroy_regex (DISALLOW);
}

/*
 * Add <level> to the module trace level and report, unconditionally
 * on stderr, the increment that was requested.
 */
void dirsel_verbose(int level)
{
  fprintf(stderr, "\tdirsel_verbose: level = %d\n", level);
  verbose = verbose + level;
}

/*
 * Release one table with hash_free. The tables are given hnode_free
 * as node destructor by dirsel_init.
 */
static void dirsel_end_1(hash_t* table)
{
  hash_free(table);
}

/*
 * Register the space separated list <dirs> as the user specified Allow
 * clauses of the netpath deduced from <url>. <flag> is handed to
 * dirsel_insert unchanged.
 */
void dirsel_allow(char* url, char* dirs, int flag)
{
  char* key;

  /* Parse <url> in the shared URL object, then derive the table key */
  uri_realloc(context.url_object, url, strlen(url));
  key = dirsel_netpath(context.url_object);

  if(verbose) {
    fprintf(stderr, "\tdirsel_allow: %s (%s)\n", key, dirs);
  }

  dirsel_insert(context.allow, key, dirs, flag);
}

/*
 * Register the space separated list <dirs> as the user specified
 * Disallow clauses of the netpath deduced from <url>. <flag> is handed
 * to dirsel_insert unchanged.
 */
void dirsel_disallow(char* url, char* dirs, int flag)
{
  char* key;

  /* Parse <url> in the shared URL object, then derive the table key */
  uri_realloc(context.url_object, url, strlen(url));
  key = dirsel_netpath(context.url_object);

  if(verbose) {
    fprintf(stderr, "\tdirsel_disallow: %s (%s)\n", key, dirs);
  }

  dirsel_insert(context.disallow, key, dirs, flag);
}

/*
 * Register the space separated list of regular expressions <dirs> as
 * the user specified Regex Allow clauses of the netpath deduced from
 * <regex> (which is parsed as a URL). The previously compiled Allow
 * expressions are dropped and replaced by those of this entry.
 * Added by Olivier Girondel (olivier.girondel@IDEALX.com, on 2000/08/01
 */
void dirsel_regex_allow(char* regex, char* dirs, int flag)
{
  char* key;

  dirsel_destroy_regex (ALLOW);

  uri_realloc(context.url_object, regex, strlen(regex));
  key = dirsel_netpath(context.url_object);

  if(verbose) {
    fprintf(stderr, "\tdirsel_regex_allow: %s (%s)\n", key, dirs);
  }

  /* Store the patterns first, then compile what the entry now holds */
  dirsel_insert(context.regex_allow, key, dirs, flag);
  dirsel_build_regex (context.regex_allow, key, ALLOW);
}

/*
 * Register the space separated list of regular expressions <dirs> as
 * the user specified Regex Disallow clauses of the netpath deduced from
 * <regex> (which is parsed as a URL). The previously compiled Disallow
 * expressions are dropped and replaced by those of this entry.
 * Added by Olivier Girondel (olivier.girondel@IDEALX.com, on 2000/08/01
 */
void dirsel_regex_disallow(char* regex, char* dirs, int flag)
{
  char* key;

  dirsel_destroy_regex (DISALLOW);

  uri_realloc(context.url_object, regex, strlen(regex));
  key = dirsel_netpath(context.url_object);

  if(verbose) {
    fprintf(stderr, "\tdirsel_regex_disallow: %s (%s)\n", key, dirs);
  }

  /* Store the patterns first, then compile what the entry now holds */
  dirsel_insert(context.regex_disallow, key, dirs, flag);
  dirsel_build_regex (context.regex_disallow, key, DISALLOW);
}

/*
 * Register the space separated list <dirs> as the robots.txt Allow
 * clauses of <netloc>. Unlike the user specified variants, the key is
 * used as given, no URL parsing is done.
 */
void dirsel_robots_allow(char* netloc, char* dirs, int flag)
{
  if(verbose) {
    fprintf(stderr, "\tdirsel_robots_allow: %s (%s)\n", netloc, dirs);
  }

  dirsel_insert(context.robots_allow, netloc, dirs, flag);
}

/*
 * Register the space separated list <dirs> as the robots.txt Disallow
 * clauses of <netloc>. Unlike the user specified variants, the key is
 * used as given, no URL parsing is done.
 */
void dirsel_robots_disallow(char* netloc, char* dirs, int flag)
{
  if(verbose) {
    fprintf(stderr, "\tdirsel_robots_disallow: %s (%s)\n", netloc, dirs);
  }

  dirsel_insert(context.robots_disallow, netloc, dirs, flag);
}

/*
 * Backend of dirsel_*allow: split <dirs> and feed the structure.
 *
 * When <key> is not in <table> a new entry is created under a copy of
 * the key. When it is already there, DIRSEL_LOAD leaves the existing
 * entry untouched; any other <flag> replaces the patterns of the entry
 * (the previous list is overwritten, not extended).
 */
static void dirsel_insert(hash_t* table, char* key, char* dirs, int flag)
{
  dirsel_t* entry;
  char** fields;
  hnode_t* found = hash_lookup(table, key);

  if(found) {
    if(flag == DIRSEL_LOAD)
      return;
    entry = (dirsel_t*)hnode_get(found);
  } else {
    entry = dirsel_alloc();
    hash_alloc_insert(table, strdup(key), (void*)entry);
  }

  /* Keep a private copy of the list: split_inplace works on the pool */
  static_alloc(&entry->pool, &entry->pool_size, strlen(dirs) + 1);
  strcpy(entry->pool, dirs);

  split_inplace(entry->pool, strlen(entry->pool), &fields, &entry->prefix_length, ' ', SPLIT_TRIM);

  /* Copy the list of fields into storage owned by the entry */
  static_alloc((char**)&entry->prefix, &entry->prefix_size, entry->prefix_length * sizeof(char*));
  memcpy((char*)entry->prefix, fields, entry->prefix_length * sizeof(char*));
}

/*
 * Return true if <url> is allowed considering the restrictions
 * associated to <netpath>. Priority goes to robots.txt specification
 * then to user specified specifications, then to user specified regular
 * expressions: each stage is only consulted if the previous one allowed
 * the URL.
 *
 * Returns 0 when <netpath> is null. Returns 1 (allowed) when <url> does
 * not parse to a canonical URL or when its netpath does not start with
 * <netpath>, since the restrictions do not apply in that case.
 *
 * The robots.txt tables are searched with the part of <netpath> that
 * precedes the first '/'; a <netpath> without any '/' is used whole.
 */
int dirsel_allowed(char* netpath, char* url)
{
  char* comparable;
  static char* netloc = 0;
  static int netloc_size = 0;

  if(!netpath) {
    fprintf(stderr, "dirsel_allowed: null netpath (probably crawl_touch on redirection)\n");
    return 0;
  }
  if(verbose) fprintf(stderr, "\tdirsel_allowed: netpath = %s, url = %s \n", netpath, url);

  if(uri_realloc(context.url_object, url, strlen(url)) != URI_CANNONICAL)
    return 1;

  comparable = dirsel_comparable(netpath, context.url_object);

  if(!comparable)
    return 1;

  {
    /*
     * strchr returns null when there is no '/' (for instance an empty
     * netpath or a bare host name prefix): fall back to the whole string
     * instead of doing arithmetic on a null pointer.
     */
    char* slash = strchr(netpath, '/');
    size_t netloc_length = slash ? (size_t)(slash - netpath) : strlen(netpath);

    static_alloc(&netloc, &netloc_size, strlen(netpath) + 1);
    memcpy(netloc, netpath, netloc_length);
    netloc[netloc_length] = '\0';
  }

  if(verbose) fprintf(stderr, "\tdirsel_allowed: comparable = %s \n", comparable);
  if(verbose) fprintf(stderr, "\tdirsel_allowed: search robots\n");
  {
    int ret = dirsel_allowed_1(comparable, entry_find(context.robots_allow, netloc), entry_find(context.robots_disallow, netloc));
    if(ret) {
      if(verbose) fprintf(stderr, "\tdirsel_allowed: search manual\n");

      ret = dirsel_allowed_1(comparable, entry_find(context.allow, netpath), entry_find(context.disallow, netpath));

      if(ret) {
        if(verbose) fprintf(stderr, "\tdirsel_allowed: search regexs\n");

        ret = dirsel_allowed_2(comparable, entry_find(context.regex_allow, netpath), entry_find(context.regex_disallow, netpath));
      }
    }
    return ret;
  }
}

/*
 * Decide if <comparable> is allowed by a pair of allow/disallow entries,
 * either of which may be null when the table has nothing for the key.
 * The allow entry is looked at first: a match there allows at once.
 * Otherwise the URL is allowed unless the disallow entry matches.
 */
static int dirsel_allowed_1(char* comparable, dirsel_t* allow, dirsel_t* disallow)
{
  int hit;

  /* Explicitly allowed */
  if(allow != 0) {
    if(verbose) fprintf(stderr, "\tdirsel_allowed_1: allow table found\n");
    hit = dirsel_match(allow, comparable);
    if(hit) {
      if(verbose) fprintf(stderr, "\tdirsel_allowed_1: match found in allow table\n");
      return 1;
    }
  }

  /* Nothing can forbid it */
  if(disallow == 0) {
    if(verbose) fprintf(stderr, "\tdirsel_allowed_1: no disallow table, therefore allowed\n");
    return 1;
  }

  if(verbose) fprintf(stderr, "\tdirsel_allowed_1: disallow table found for '%s'\n", comparable);

  /* Allowed only if no disallow pattern matches */
  hit = dirsel_match(disallow, comparable);
  if(verbose) fprintf(stderr, "\tdirsel_allowed_1: match %sfound in disallow table\n", (hit ? "" : "not "));
  return hit ? 0 : 1;
}

/*
 * Decide if <comparable> is allowed by a pair of regex allow/disallow
 * entries, either of which may be null when the table has nothing for
 * the key. The allow entry is looked at first: a match there allows at
 * once. Otherwise the URL is allowed unless the disallow entry matches.
 * Added by Olivier Girondel (olivier.girondel@IDEALX.com, on 2000/08/01
 */
static int dirsel_allowed_2(char* comparable, dirsel_t* regex_allow, dirsel_t* regex_disallow)
{
  int hit;

  /* Explicitly allowed */
  if(regex_allow != 0) {
    if(verbose) fprintf(stderr, "\tdirsel_allowed_2: allow table found\n");
    hit = dirsel_regex_match(regex_allow, comparable, ALLOW);
    if(hit) {
      if(verbose) fprintf(stderr, "\tdirsel_allowed_2: match found in allow table\n");
      return 1;
    }
  }

  /* Nothing can forbid it */
  if(regex_disallow == 0) {
    if(verbose) fprintf(stderr, "\tdirsel_allowed_2: no disallow table, therefore allowed\n");
    return 1;
  }

  if(verbose) fprintf(stderr, "\tdirsel_allowed_2: disallow table found for '%s'\n", comparable);

  /* Allowed only if no disallow expression matches */
  hit = dirsel_regex_match(regex_disallow, comparable, DISALLOW);
  if(verbose) fprintf(stderr, "\tdirsel_allowed_2: match %sfound in disallow table\n", (hit ? "" : "not "));
  return hit ? 0 : 1;
}

/*
 * Return the part of <url_object> that must be compared with the
 * strings stored in dirsel_t structures, that is its path member.
 * Return null when the netpath of <url_object> does not begin with
 * <netpath>: the URL is outside the scope of the restrictions.
 */
static char* dirsel_comparable(char* netpath, uri_t* url_object)
{
  char* url_netpath = dirsel_netpath(url_object);
  int wanted = strlen(netpath);

  /* Only the first strlen(netpath) characters have to agree */
  return strncmp(url_netpath, netpath, wanted) ? 0 : url_object->path;
}

/*
 * Return 1 if one of the patterns of <entry> is found in <comparable>,
 * 0 otherwise.
 *
 * A pattern starting with '/' is anchored: the leading '/' is dropped
 * and the rest must be found at the very beginning of <comparable>.
 * Any other pattern may be found anywhere in <comparable>.
 * A pattern that is empty once the '/' is dropped never matches.
 */
static int dirsel_match(dirsel_t* entry, char* comparable)
{
  int idx;

  for(idx = 0; idx < entry->prefix_length; idx++) {
    char* raw = entry->prefix[idx];
    char method;
    char* needle;
    int needle_length;
    int matched = 0;

    if(raw[0] == DIRSEL_BOL) {
      method = DIRSEL_BOL;
      needle = raw + 1;
    } else {
      method = DIRSEL_ANY;
      needle = raw;
    }
    needle_length = strlen(needle);

    if(verbose) fprintf(stderr, "\tdirsel_match: compare prefix %s (deduced from %s) with %s using method %s\n", needle, raw, comparable, (method == DIRSEL_BOL ? "bol" : "any"));

    /* Nothing to look for */
    if(needle_length <= 0)
      continue;

    switch(method) {
    case DIRSEL_BOL:
      matched = (strncmp(needle, comparable, needle_length) == 0);
      break;
    case DIRSEL_ANY:
      matched = (strstr(comparable, needle) != 0);
      break;
    default:
      fprintf(stderr, "dirsel_match: unknown method %d\n", method);
      break;
    }

    if(matched) {
      if(verbose) fprintf(stderr, "\tdirsel_match: %s match found prefix %s (method %s)\n", comparable, raw, (method == DIRSEL_BOL ? "bol" : "any"));
      return 1;
    }
  }

  if(verbose) fprintf(stderr, "\tdirsel_match: %s no match\n", comparable);
  return 0;
}

/*
 * Create a zeroed entry and give it initial storage: room for
 * 10 pattern pointers and a 32 bytes string pool. The entry is
 * released by hnode_free once it is stored in a table.
 */
static dirsel_t* dirsel_alloc()
{
  dirsel_t* fresh = (dirsel_t*)smalloc(sizeof(dirsel_t));

  memset((char*)fresh, '\0', sizeof(dirsel_t));

  static_alloc(&fresh->pool, &fresh->pool_size, 32);
  static_alloc((char**)&fresh->prefix, &fresh->prefix_size, 10 * sizeof(char*));

  return fresh;
}

/*
 * Return 1 if <comparable> matches one of the compiled regular
 * expressions of type <regex_type> (ALLOW or DISALLOW), 0 otherwise.
 * Expressions that failed to compile (null slot) are skipped.
 *
 * <entry> is only used to show the source text of the expressions in
 * trace messages.
 *
 * NOTE(review): the compiled set is global to <regex_type> and holds the
 * expressions of the last entry given to dirsel_build_regex, which is
 * not necessarily <entry>. The source text is therefore only read when
 * the index is valid for <entry>; confirm whether matching against the
 * expressions of another key is intended.
 *
 * Added by Olivier Girondel (olivier.girondel@IDEALX.com, on 2000/08/01
 */
static int dirsel_regex_match(dirsel_t* entry, char* comparable, int regex_type)
{
#define ERRSIZE 256
  static char error[ERRSIZE];

  /*
    Olivier: which flags should be given to regexec ? see regexec(3)
  */

  int i;

  /* No compiled set at all: nothing can match */
  if(!regexs[regex_type]) {
    if(verbose) fprintf(stderr, "\tdirsel_regex_match: %s no match\n", comparable);
    return 0;
  }

  for(i = 0; i < regex_length[regex_type]; i++) {
    regex_t* compiled = regexs[regex_type][i];
    const char* source;
    int result;

    /* make sure the regex was correctly compiled */
    if(!compiled)
      continue;

    /* Never read past the patterns actually stored in <entry> */
    source = (i < entry->prefix_length) ? entry->prefix[i] : "(unknown)";

    if(verbose) fprintf(stderr, "\tdirsel_regex_match: match regex %s with %s\n", source, comparable);

    result = regexec (compiled, comparable, 0, NULL, 0 /* flags = ? */);

    if(!result) {
      if(verbose) fprintf(stderr, "\tdirsel_regex_match: %s match found regex %s\n", comparable, source);
      return 1;
    }

    /* REG_NOMATCH is the normal negative answer, only report real failures */
    if(result == REG_ESPACE) {
      regerror (result, compiled, error, ERRSIZE);
      if(verbose)
        fprintf(stderr, "\tdirsel_regex_match: failed regexec: %s\n", error);
    }
  }
  if(verbose) fprintf(stderr, "\tdirsel_regex_match: %s no match\n", comparable);
  return 0;
}

/*
 * Return netpath component from <url>.
 *
 * The string "_unlikely_" is returned when <url> does not parse to a
 * canonical URL or is relative. Otherwise the result is the static
 * buffer of dirsel_netpath: it is overwritten by the next function of
 * this file that calls dirsel_netpath, so copy it if it must be kept.
 */
char* dirsel_key(char* url)
{
  if(verbose) fprintf(stderr, "\tdirsel_key: %s\n", url);

  if(uri_realloc(context.url_object, url, strlen(url)) != URI_CANNONICAL)
    return "_unlikely_";

  if(context.url_object->info & URI_INFO_RELATIVE) {
    fprintf(stderr, "dirsel_key: unexpected relative url %s\n", url);
    return "_unlikely_";
  }

  return dirsel_netpath(context.url_object);
}

/*
 * Return netpath component from <url_object>: the network location,
 * a '/' and the result of urldirname applied to the path (nothing
 * when there is no path or when urldirname answers ".").
 *
 * The result lives in a static buffer reused by every call.
 */
static char* dirsel_netpath(uri_t* url_object)
{
  static char* netpath = 0;
  static int netpath_size = 0;
  char* netloc = uri_netloc(url_object);
  const char* dir = url_object->path;

  if(!dir)
    dir = "";
  dir = urldirname(dir);
  if(strcmp(dir, ".") == 0)
    dir = "";

  /* Both strings, the separator and the final null fit in this size */
  static_alloc(&netpath, &netpath_size, strlen(netloc) + strlen(dir) + 3);
  sprintf(netpath, "%s/%s", netloc, dir);

  return netpath;
}

/*
 * Return the entry stored under <key> in <table>, or null when
 * the table has no such key.
 */
static dirsel_t* entry_find(hash_t* table, char* key)
{
  hnode_t* hit = hash_lookup(table, key);

  if(!hit)
    return 0;
  return (dirsel_t*)hnode_get(hit);
}


/*
 * Release the compiled regular expressions of type <regex_type>
 * (ALLOW or DISALLOW) and mark the set as empty. Does nothing, apart
 * from the trace messages, when the set is already empty.
 * Added by Olivier Girondel (olivier.girondel@IDEALX.com, on 2000/08/01
 */
void dirsel_destroy_regex (int regex_type) {
  regex_t** compiled = regexs[regex_type];

  if (verbose)
    fprintf (stderr, "\tfreeing %sallow regexs ...", (regex_type) ? "dis" : "");

  if (compiled != NULL) {
    int count = regex_length[regex_type];
    int idx;

    for (idx = 0; idx < count; idx++) {
      /* Slots of expressions that failed to compile are null */
      if (compiled[idx] == NULL)
        continue;
      regfree (compiled[idx]);
      free (compiled[idx]);
    }

    free (compiled);

    regex_length[regex_type] = 0;
    regexs[regex_type] = NULL;
  }

  if (verbose)
    fprintf (stderr, "\tdone.\n");
}


/*
 * Compile the patterns of the entry stored under <key> in <table> as
 * extended regular expressions (REG_EXTENDED | REG_NOSUB) and make them
 * the current set of type <regex_type> (ALLOW or DISALLOW).
 *
 * A pattern that does not compile leaves a null slot in the set, which
 * dirsel_regex_match and dirsel_destroy_regex skip. When <key> is not
 * in <table>, or when memory for the set cannot be obtained, the set is
 * left empty.
 *
 * The previous set is overwritten without being released: callers are
 * expected to call dirsel_destroy_regex first, as dirsel_regex_allow and
 * dirsel_regex_disallow do.
 *
 * Added by Olivier Girondel (olivier.girondel@IDEALX.com, on 2000/08/01
 */
void dirsel_build_regex (hash_t* table, char* key, int regex_type) {
#define ERRSIZE 256
  static char error[ERRSIZE];

  /*
    Olivier: which flags should be given to regcomp ? see regcomp(3)
  */

  int i;
  int ret;
  int count;
  regex_t** compiled;
  dirsel_t* entry;

  hnode_t* node = hash_lookup(table, key);

  /* Start from an empty set so every early return leaves a sane state */
  regex_length[regex_type] = 0;
  regexs[regex_type] = NULL;

  if (!node) {
    fprintf (stderr, "dirsel_build_regex: no entry for %s\n", key);
    return;
  }
  entry = (dirsel_t*)hnode_get(node);
  count = entry->prefix_length;

  if (verbose)
    fprintf (stderr, "\tcompiling %sallow regexs :\n", (regex_type) ? "dis" : "");

  if (count <= 0)
    return;

  compiled = (regex_t**)malloc (count * sizeof(regex_t*));
  if (!compiled) {
    fprintf (stderr, "dirsel_build_regex: out of memory\n");
    return;
  }
  regexs[regex_type] = compiled;
  regex_length[regex_type] = count;

  for (i = 0; i < count; i++) {
    regex_t* re = (regex_t*)malloc (sizeof (regex_t));

    /* The slot stays null unless the expression compiles */
    compiled[i] = NULL;

    if (verbose)
      fprintf (stderr, "\t\t regex: %s ...", entry->prefix[i]);

    if (!re) {
      fprintf (stderr, "dirsel_build_regex: out of memory\n");
      continue;
    }

    ret = regcomp(re, entry->prefix[i], REG_NOSUB | REG_EXTENDED);
    if (ret) {
      if (verbose) {
        regerror (ret, re, error, ERRSIZE);
        fprintf (stderr, "failed: %s.\n", error);
      }

      /*
       * The content of a regex_t is undefined after a failed regcomp:
       * it must not be given to regfree, only the storage is released.
       */
      free (re);
      continue;
    }

    compiled[i] = re;
    if (verbose)
      fprintf (stderr, "ok.\n");
  }
}
