找了很久的bug

来源:互联网 发布:wow数据库7.0 编辑:程序博客网 时间:2024/04/18 13:01
因为 VS 在按 C 语言编译时,对函数调用中多传参数或参数类型不匹配往往只给出警告(甚至不提示)而不会报错,所以可能会产生非常奇怪的 bug。
#pragma comment(lib, "ws2_32.lib")
#define _CRT_SECURE_NO_DEPRECATE
#define SOCK_STREAM 1
#define AF_INET 2
#define SIZE_OF_SENDBUF 10000
#define SIZE_OF_RECEIVEBUF 1000
#define REASON_SIZE  128
#define SIZE_OF_IP 128
#define SOURCE 10
#define SIZE_OF_DOMAIN_NAME 128
#define SIZE_OF_HOST 128
#define SIZE_OF_URL 100//每个url最长不超过100字节
#define NUMBER_OF_VISITED_URLS 1000
#define XUN_HUAN_CI_SHU 3
#define SIZE_OF_PATH 70
#define SIZE_OF_INFORMATION 100
/* One element of a singly linked list of URLs. The same node type is used
 * both for the pending-URL list and for the chains inside the hash table. */
struct node
{
 char url[SIZE_OF_URL];   /* NUL-terminated URL text */
 struct node * next;      /* following node, or NULL at the end of the list */
};
/* A list is addressed through a pointer to its first node; the lists built
 * in this file start with a dummy head node whose url field is not used. */
typedef struct node * List;
/* Separate-chaining hash table of URLs that have already been visited. */
struct hashtable
{
 List *url;       /* array of tablesize bucket lists, each starting with a dummy head node */
 int tablesize;   /* number of buckets; initialize_hash sets it from Nextprime() */
};
/* Handle type used by every hash-table function in this file. */
typedef struct hashtable *ahash;
#include <windows.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <winSock.h>
#include <string.h>
#include <mysql.h>
/* Forward declarations for every function defined below main(). */

/* Database bookkeeping: bump (or create) the counter row for `name`. */
void add_one_in_name(MYSQL *mysql, char *name);
/* Writes every substring of `text` found between `before` and the first
 * character of `after` to `file`, one per line. */
void collect_information(FILE *file, char *text, char *before, char *after, int max_size);
void insert_in_list(List head, char *string);
int standardization_url(char *url, char *root_domain_name, int max_size);
void delete_in_list(List head, List ptr);
void insert_in_hash(ahash ahash, char *url);
int isvisited(ahash hash, char *url);
/* Takes exactly 4 parameters - there is no MYSQL* argument. */
void viste_one_cycle(FILE* collect_informations, FILE *writeto, List head, ahash hash);
int hash(ahash hash, char *string);
int Nextprime(int i);
int Isprime(int i);
/* Parameter names now follow the definition and the call site: the host
 * output comes second and the path output comes third. */
void get_domain_name_and_path_from_url(char *url, char *domain_name, char *path);
ahash initialize_hash(int up_limit);
List initialize_url_list(FILE *readfrom);
SOCKET build_connect(char *ID, int port);
void domain_name_to_IP(char *domain_name, char *IP);
void myexit(char * error_position);
int Socket(int family, int type, int protocal);
void Connect(SOCKET s, const struct sockaddr * name, int namelen);
/* Extracts each substring of `text` lying between `before` and the first
 * character of `after`, adds the ones that normalise into crawlable URLs to
 * `head` (and logs them to `file`), and returns how many delimited
 * substrings were found. */
int collect_new_url(FILE *file, List head, const char* root_domain_name, const char * text, const char *before, const char *after, int max_size);
int main(void)
{
 LPWSADATA wsaData;
 WSAStartup(MAKEWORD(2, 1), &wsaData);
 MYSQL * mysql = mysql_init(0);
 mysql_real_connect(mysql, "localhost", "root", "root", "webrobot", 0, 0, 0);
 FILE *readfrom = fopen("1.txt", "r");
 FILE *collect_informations = fopen("3.txt", "w");
 FILE *write_visited_url_to = fopen("2.txt", "w");
 ahash hash = initialize_hash(NUMBER_OF_VISITED_URLS);
 List head = initialize_url_list(readfrom);
 for (int i = 1; i <= XUN_HUAN_CI_SHU; i++)
  viste_one_cycle(mysql, collect_informations, write_visited_url_to, head, hash);//传了5个参数导致错位,但vs不会报错,由此产生奇怪的bug
 fclose(collect_informations);
 fclose(readfrom);
 fclose(write_visited_url_to);
 mysql_close(mysql);
 WSACleanup();
 return 0;
}
/*
 * Resolves a host name to a dotted-decimal IPv4 string.
 *
 * domain_name: NUL-terminated host name to look up.
 * IP:          output buffer; receives the textual address. It must hold at
 *              least 16 bytes ("255.255.255.255" plus the terminator).
 *
 * On lookup failure the error is reported through myexit() and IP is set to
 * the empty string, so the caller can detect the failure. myexit() does not
 * terminate the program, so the result must not be dereferenced after it.
 */
void domain_name_to_IP(char *domain_name, char *IP)
{
    struct hostent *target_server = gethostbyname(domain_name);
    if (target_server == NULL || target_server->h_addr_list[0] == NULL)
    {
        myexit("获取DNS服务失败\n");
        IP[0] = '\0';
        return;
    }

    /* Copy the raw address bytes instead of casting the char pointer, which
     * avoids any alignment assumption about h_addr_list entries. */
    struct in_addr addr;
    memcpy(&addr, target_server->h_addr_list[0], sizeof addr);
    strcpy(IP, inet_ntoa(addr));
}
SOCKET build_connect(char * IP, int port)
{
 SOCKET s = Socket(AF_INET, SOCK_STREAM, 0);
 struct sockaddr_in serveraddr;
 memset(serveraddr.sin_zero, 0, sizeof(serveraddr.sin_zero));
 serveraddr.sin_port = htons(port);
 serveraddr.sin_family = AF_INET;
 serveraddr.sin_addr.S_un.S_addr = inet_addr(IP);
 Connect(s, (struct sockaddr *)&serveraddr, sizeof(serveraddr));
 return s;
}
/*
 * Wrapper around connect() that reports failures.
 *
 * s:       socket to connect.
 * name:    destination address.
 * namelen: size of *name in bytes.
 *
 * On failure it prints the Winsock error code followed by the error message
 * (via myexit) and returns; the caller gets no status back.
 */
void Connect(SOCKET s, const struct sockaddr * name, int namelen)
{
    if (connect(s, name, namelen) == 0)
        return;

    /* WSAGetLastError must be called; passing the bare function name
     * printed the function's address instead of the error code. */
    printf("%d\n", WSAGetLastError());
    myexit("与服务器建立连接失败\n");
}
int Socket(int family, int type, int protocal)
{
 SOCKET s = socket(family, type, protocal);
 if (s > 0)
  return s;
 else
 {
  printf("%d", WSAGetLastError);
  myexit("创建套接字失败\n");
 }
}
/*
 * Reports an error message on standard output.
 *
 * error_position: NUL-terminated message, printed verbatim.
 *
 * Despite its name this function does not terminate the program: it returns
 * to the caller, which must cope with the failed operation itself.
 */
void myexit(char * error_position)
{
    fputs(error_position, stdout);
}
/*
 * Scans `text` for link candidates and queues the usable ones.
 *
 * A candidate is the text between an occurrence of `before` and the next
 * occurrence of the FIRST character of `after`. Each candidate that
 * standardization_url() accepts is pushed onto `head` and written to `file`
 * followed by a newline.
 *
 * file:             stream that receives one accepted URL per line.
 * head:             dummy head of the pending-URL list.
 * root_domain_name: host used to complete relative ("/...") links.
 * text:             NUL-terminated page fragment to scan.
 * before, after:    delimiters; only after[0] is used as the terminator.
 * max_size:         size of the working buffer; candidates that need more
 *                   than max_size - 1 characters are skipped.
 *
 * Returns the number of terminated candidates found (accepted or not).
 * A candidate with no terminator before the end of `text` is ignored.
 */
int collect_new_url(FILE *file, List head, const char* root_domain_name, const char * text, const char *before, const char *after, int max_size)
{
    if (max_size <= 0 || before[0] == '\0' || after[0] == '\0')
        return 0;

    char *url = malloc((size_t)max_size);
    if (url == NULL)
        return 0;

    const size_t before_len = strlen(before);
    const size_t capacity = (size_t)max_size - 1;
    const char *cursor = text;
    int count = 0;

    /* strstr restarts cleanly after every partial match, and all per-match
     * state lives inside the loop, so an over-long candidate can no longer
     * poison the matches that follow it. */
    while ((cursor = strstr(cursor, before)) != NULL)
    {
        const char *start = cursor + before_len;
        const char *end = strchr(start, after[0]);
        if (end == NULL)
            break;                  /* unterminated candidate: stop scanning */

        size_t length = (size_t)(end - start);
        if (length > capacity)
        {
            cursor = start;         /* too long: skip it, keep scanning */
            continue;
        }

        memcpy(url, start, length);
        url[length] = '\0';

        /* standardization_url only reads the root domain; the cast exists
         * because its prototype lacks the const qualifier. */
        if (standardization_url(url, (char *)root_domain_name, max_size))
        {
            insert_in_list(head, url);
            fputs(url, file);
            fputc('\n', file);
        }
        count++;
        cursor = end + 1;
    }

    free(url);
    return count;
}
ahash initialize_hash(int max_number)
{
 ahash hash = (ahash)malloc(sizeof(struct hashtable));
 hash->tablesize = Nextprime(max_number);
 hash->url = (List *)malloc(sizeof(List)*hash->tablesize);
 for (int i = 0; i < hash->tablesize; i++)
 {
  hash->url[i] = (List)malloc(sizeof(struct node));
  hash->url[i]->next = NULL;
 }
 return hash;
}
/*
 * Returns the smallest value >= n for which Isprime() reports true.
 */
int Nextprime(int n)
{
    int candidate = n;

    while (!Isprime(candidate))
        candidate++;
    return candidate;
}
/*
 * Primality test by trial division.
 *
 * n: value to test.
 *
 * Returns 1 when n is prime and 0 otherwise (including every n < 2).
 */
int Isprime(int n)
{
    if (n < 2)
        return 0;

    /* Divisibility needs the remainder operator; the previous "n / i == 0"
     * was never true for i < n, so every number passed as prime.
     * Checking divisors up to sqrt(n) is sufficient; "i <= n / i" expresses
     * that bound without risking overflow in i * i. */
    for (int i = 2; i <= n / i; i++)
        if (n % i == 0)
            return 0;
    return 1;
}
List initialize_url_list(FILE *readfrom)
{
 List head = (List)malloc(sizeof(struct node));
 head->next = NULL;
 char url[SIZE_OF_URL];
 while (fgets(url, sizeof(url), readfrom))
 {
  List ptr = (List)malloc(sizeof(struct node));
  ptr->next = head->next;
  head->next = ptr;
  strcpy(ptr->url, url);
 }
 return head;
}
/*
 * Splits a scheme-less URL ("host/path...") at its first '/'.
 *
 * url:         NUL-terminated URL; not modified.
 * domain_name: output; receives everything before the first '/'.
 * path:        output; receives the rest including the leading '/', or "/"
 *              when the URL contains no '/' at all.
 *
 * No buffer sizes are passed in, so both output buffers must be large enough
 * for their part of the URL (strlen(url) + 1 bytes is always sufficient).
 * The URL and both results are echoed to standard output.
 */
void get_domain_name_and_path_from_url(char *url, char *domain_name, char *path)
{
    puts(url);

    /* strcspn stops at the terminator as well, so a URL without any '/'
     * no longer makes the scan run past the end of the string. */
    size_t host_length = strcspn(url, "/");
    memcpy(domain_name, url, host_length);
    domain_name[host_length] = '\0';

    if (url[host_length] == '\0')
        strcpy(path, "/");
    else
        strcpy(path, url + host_length);

    puts(domain_name);
    puts(path);
}
/*
 * Tells whether a URL is already stored in the visited table.
 *
 * ahash: table to search.
 * url:   NUL-terminated URL to look for (exact, case-sensitive match).
 *
 * Returns 1 if the URL is present, 0 otherwise.
 */
int isvisited(ahash ahash, char *url)
{
    List entry;

    /* Walk the chain of the URL's bucket, skipping the dummy head node. */
    for (entry = ahash->url[hash(ahash, url)]->next; entry != NULL; entry = entry->next)
    {
        if (!strcmp(entry->url, url))
            return 1;
    }
    return 0;
}
/*
 * Records a URL in the visited table.
 *
 * ahash: table to insert into.
 * url:   NUL-terminated URL; it is copied into a new node placed at the
 *        front of its bucket chain. No duplicate check is made.
 *
 * If the node cannot be allocated the URL is simply not recorded. A URL
 * longer than the node's buffer is truncated rather than overflowing it.
 */
void insert_in_hash(ahash ahash, char *url)
{
    List entry = malloc(sizeof *entry);
    if (entry == NULL)
        return;

    snprintf(entry->url, sizeof entry->url, "%s", url);

    int bucket = hash(ahash, url);
    entry->next = ahash->url[bucket]->next;
    ahash->url[bucket]->next = entry;
}
/*
 * Unlinks one node from a list and frees it.
 *
 * head: dummy head node of the list.
 * ptr:  the node to remove; it must not be used after this call.
 *
 * The node is located by pointer identity. Matching on the URL text removed
 * the wrong span whenever the same URL appeared earlier in the list: the
 * earlier duplicate and every node between it and ptr were dropped and
 * leaked. If ptr is not part of the list nothing is changed or freed.
 */
void delete_in_list(List head, List ptr)
{
    if (head == NULL || ptr == NULL)
        return;

    List previous = head;
    while (previous->next != NULL && previous->next != ptr)
        previous = previous->next;

    if (previous->next == NULL)
        return;                     /* ptr is not in this list */

    previous->next = ptr->next;
    free(ptr);
}
/*
 * Computes the bucket index of a string.
 *
 * hash:   table whose tablesize bounds the result.
 * string: NUL-terminated key.
 *
 * The value is built by shifting the accumulator left by five bits and
 * adding each character in turn (unsigned arithmetic, so it wraps), then
 * reduced modulo the table size.
 */
int hash(ahash hash, char *string)
{
    unsigned int accumulator = 0;
    const char *cursor;

    for (cursor = string; *cursor != '\0'; ++cursor)
        accumulator = (accumulator << 5) + *cursor;
    return accumulator % hash->tablesize;
}
/*
 * Pushes a copy of a string onto the front of a list.
 *
 * head:   dummy head node of the list.
 * string: NUL-terminated text to store.
 *
 * If the node cannot be allocated the list is left unchanged. Text longer
 * than the node's buffer is truncated rather than overflowing it.
 */
void insert_in_list(List head, char *string)
{
    List entry = malloc(sizeof *entry);
    if (entry == NULL)
        return;

    snprintf(entry->url, sizeof entry->url, "%s", string);
    entry->next = head->next;
    head->next = entry;
}
/*
 * Normalises a link into the scheme-less "www.host/path" form used by the
 * crawler, rewriting `url` in place.
 *
 * url:              NUL-terminated link text inside a buffer of max_size bytes.
 * root_domain_name: host of the page the link came from; prepended to links
 *                   that start with '/'. Only read, never modified.
 * max_size:         size of the url buffer in bytes.
 *
 * Accepted forms:
 *   "www. ..."          already normalised, left unchanged
 *   "/ ..."             relative path, the root domain is prepended
 *   "http://www ..."    the scheme is stripped
 *   "https://www ..."   the scheme is stripped
 *
 * Returns 1 if url now holds a normalised URL. Returns 0, leaving url
 * unchanged, when the link is 11 characters or shorter, does not fit in
 * max_size, has an unsupported form, or would not fit after the root
 * domain is prepended.
 */
int standardization_url(char *url, char *root_domain_name, int max_size)
{
    if (max_size <= 0)
        return 0;

    const size_t capacity = (size_t)max_size;
    const size_t url_length = strlen(url);

    /* Links that are too short or too long are rejected. */
    if (url_length <= 11 || url_length >= capacity)
        return 0;

    /* Already normalised. The dot of "www." is at index 3; the earlier
     * check looked at index 4 and so rejected every such URL. */
    if (strncmp(url, "www.", 4) == 0)
        return 1;

    /* Relative path: put the page's host in front of it, but only when the
     * combined string still fits. The shift is done in place, so no
     * temporary buffer can leak or overflow. */
    if (url[0] == '/')
    {
        const size_t root_length = strlen(root_domain_name);
        if (root_length + url_length >= capacity)
            return 0;
        memmove(url + root_length, url, url_length + 1);
        memcpy(url, root_domain_name, root_length);
        return 1;
    }

    /* Absolute link: only http(s)://www... is recognised; drop the scheme. */
    size_t scheme_length;
    if (strncmp(url, "http://www", 10) == 0)
        scheme_length = strlen("http://");
    else if (strncmp(url, "https://www", 11) == 0)
        scheme_length = strlen("https://");
    else
        return 0;

    memmove(url, url + scheme_length, url_length - scheme_length + 1);
    return 1;
}
/*
 * Extracts delimited values from a page fragment and logs them.
 *
 * A value is the text between an occurrence of `before` and the next
 * occurrence of the FIRST character of `after`. Every value found is written
 * to `file` followed by a newline, and a status line is printed on standard
 * output for each write.
 *
 * file:          stream that receives one value per line.
 * text:          NUL-terminated fragment to scan; not modified.
 * before, after: delimiters; only after[0] is used as the terminator.
 * max_size:      size of the working buffer; values that need more than
 *                max_size - 1 characters are skipped.
 *
 * A value with no terminator before the end of `text` is ignored.
 */
void collect_information(FILE *file, char *text, char *before, char *after, int max_size)
{
    if (max_size <= 0 || before[0] == '\0' || after[0] == '\0')
        return;

    char *target_string = malloc((size_t)max_size);
    if (target_string == NULL)
        return;

    const size_t before_len = strlen(before);
    const size_t capacity = (size_t)max_size - 1;
    const char *cursor = text;

    /* strstr restarts cleanly after every partial match, and all per-match
     * state lives inside the loop, so an over-long value can no longer
     * poison the matches that follow it. */
    while ((cursor = strstr(cursor, before)) != NULL)
    {
        const char *start = cursor + before_len;
        const char *end = strchr(start, after[0]);
        if (end == NULL)
            break;                  /* unterminated value: stop scanning */

        size_t length = (size_t)(end - start);
        if (length > capacity)
        {
            cursor = start;         /* too long: skip it, keep scanning */
            continue;
        }

        memcpy(target_string, start, length);
        target_string[length] = '\0';

        /* The failure branch used to print the success message too. */
        if (fputs(target_string, file) == EOF)
            printf("\n\n\n\n\n\n\n写入文件失败\n\n\n\n\n\n\n\n");
        else
            printf("\n\n\n\n\n\n\n写入文件\n\n\n\n\n\n\n\n");
        fputc('\n', file);

        cursor = end + 1;
    }

    free(target_string);
}
/*
 * Increments the counter stored for `name` in the `robot` table, creating
 * the row with a count of 1 when the name is not present yet.
 *
 * mysql: connected MySQL handle.
 * name:  NUL-terminated name taken from crawled page content.
 *
 * The name is untrusted text scraped from the web, so it is escaped with
 * mysql_real_escape_string before being placed inside an SQL statement.
 * Names longer than MAX_NAME_LEN bytes are ignored. Any database error makes
 * the function return without changing anything.
 */
void add_one_in_name(MYSQL *mysql, char *name)
{
    enum { MAX_NAME_LEN = 100 };
    char escaped[2 * MAX_NAME_LEN + 1];   /* worst case: every byte escaped */
    char sql[2 * MAX_NAME_LEN + 64];
    int found = 0;

    const size_t name_length = strlen(name);
    if (name_length > MAX_NAME_LEN)
        return;

    if (mysql_query(mysql, "select * from robot") != 0)
        return;
    MYSQL_RES *result = mysql_store_result(mysql);
    if (result == NULL)
        return;

    /* The first column is compared byte-for-byte with the requested name. */
    MYSQL_ROW row;
    while ((row = mysql_fetch_row(result)) != NULL)
    {
        if (row[0] != NULL && strcmp(row[0], name) == 0)
        {
            found = 1;
            break;
        }
    }
    mysql_free_result(result);

    mysql_real_escape_string(mysql, escaped, name, (unsigned long)name_length);
    if (found)
        snprintf(sql, sizeof sql, "update robot set value=value+1 where name='%s'", escaped);
    else
        snprintf(sql, sizeof sql, "insert into robot values('%s',1)", escaped);
    mysql_query(mysql, sql);
}

void viste_one_cycle(FILE *collect_informations, FILE *write_visited_url_to, List head, ahash hash)
{
 int port;
 char sendbuf[SIZE_OF_SENDBUF], receivebuf[SIZE_OF_RECEIVEBUF] = { 0 }, target_page[SOURCE];
 char IP[SIZE_OF_IP];
 char domain_name[SIZE_OF_DOMAIN_NAME];
 char path[SIZE_OF_PATH];
 List ptr = head->next;
 while (ptr)
 {
  while (isvisited(hash, ptr->url))
  {
   if (!ptr->next)
   {
    delete_in_list(head, ptr);
    ptr = NULL;
    break;
   }
   List next_ptr = ptr->next;
   delete_in_list(head, ptr);
   ptr = next_ptr;
  }
  if (!ptr)
   break;
  get_domain_name_and_path_from_url(ptr->url, domain_name, path);
  domain_name_to_IP(domain_name, IP);
  port = 80;
  SOCKET client = build_connect(IP, port);
  sprintf(sendbuf, "GET %s HTTP 1.1\r\nHost: %s\r\nAccept: *\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586\r\nAccept-Language: zh-CN\r\nAccept-Charset: us-ascii\r\n\r\n", path, domain_name);
  send(client, sendbuf, strlen(sendbuf) + 1, 0);
  if (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0)>0)
  {
   puts(receivebuf);
   collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
   collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);
   while (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0) > 0)
   {
    puts(receivebuf);
    collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
    collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);

   }
  }
  else
  {
   closesocket(client);
   SOCKET client = build_connect(IP, port);
   send(client, sendbuf, strlen(sendbuf) + 1, 0);
   while (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0) > 0)
   {
    puts(receivebuf);
    collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
    collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);
   }
  }
  closesocket(client);
  insert_in_hash(hash, ptr->url);
  List next_ptr = ptr->next;
  delete_in_list(head, ptr);
  ptr = next_ptr;
 }
}
0 0