找了很久的bug

来源：互联网　发布：wow数据库7.0　编辑：程序博客网　时间：2024/04/18 13:01

因为 VS 在编译 C 代码时，对函数调用时多传了参数或参数类型不匹配的情况只给出警告而不会报错（编译仍然通过），如果忽略了这些警告，就可能产生非常奇怪的 bug。下面的代码中，viste_one_cycle 的声明只有 4 个参数，而 main 中调用时却传了 5 个。

// Ask the MSVC linker to link against the Winsock library.
#pragma comment(lib, "ws2_32.lib")
// NOTE(review): presumably meant to silence MSVC's CRT deprecation warnings
// (strcpy, sprintf, fopen are used below) - confirm it is still needed.
#define _CRT_SECURE_NO_DEPRECATE
// NOTE(review): SOCK_STREAM and AF_INET are normally supplied by the Winsock
// header included further down - these definitions look redundant; verify.
#define SOCK_STREAM 1
#define AF_INET 2
// Size of the buffer that holds the outgoing HTTP request.
#define SIZE_OF_SENDBUF 10000
// Size of the buffer passed to recv() for each chunk of the response.
#define SIZE_OF_RECEIVEBUF 1000
// Not referenced anywhere in the visible source.
#define REASON_SIZE 128
// Size of the buffer that receives the dotted-decimal IP string.
#define SIZE_OF_IP 128
// Size of the (unused) target_page buffer in viste_one_cycle.
#define SOURCE 10
// Size of the buffer that receives the host part of a URL.
#define SIZE_OF_DOMAIN_NAME 128
// Not referenced anywhere in the visible source.
#define SIZE_OF_HOST 128
#define SIZE_OF_URL 100// each URL is at most 100 bytes long
// Passed to initialize_hash() as the requested minimum table size.
#define NUMBER_OF_VISITED_URLS 1000
// Number of crawl passes main() runs (pinyin for "loop count").
#define XUN_HUAN_CI_SHU 3
// Size of the buffer that receives the path part of a URL.
// NOTE(review): this is smaller than SIZE_OF_URL, and
// get_domain_name_and_path_from_url() copies the path without a bound.
#define SIZE_OF_PATH 70
// Passed as max_size to collect_information(): capacity of one extracted value.
#define SIZE_OF_INFORMATION 100
// Singly linked list node holding one URL.
// Used both for the pending-URL list and for the hash-table bucket chains.
struct node
{
// NUL-terminated URL text, stored inline (fixed capacity of SIZE_OF_URL bytes).
char url[SIZE_OF_URL];
// Next node in the chain, or NULL at the end.
struct node * next;
};
// A List is a pointer to a node; list heads in this file are sentinel nodes
// whose url field is never read (traversal always starts at head->next).
typedef struct node * List;
// Separate-chaining hash table of visited URLs.
struct hashtable
{
// Array of tablesize bucket heads; each head is a sentinel node allocated by
// initialize_hash(), and the real entries hang off head->next.
List *url;
// Number of buckets (initialize_hash() sets it from Nextprime()).
int tablesize;
};
// Handle type used throughout the file for the visited-URL table.
typedef struct hashtable *ahash;
#include <windows.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <winSock.h>
#include <string.h>
#include <mysql.h>
/* Forward declarations for every function defined in this file. */

/* Increment the counter stored for `name` in the `robot` table, inserting it with 1 if absent. */
void add_one_in_name(MYSQL *mysql, char *name);
/* Write every substring of `text` found between `before` and the first char of `after` to `file`, one per line. */
void collect_information(FILE *file, char *text, char *before, char *after, int max_size);
/* Push a copy of `string` onto the front of the list (right after the sentinel head). */
void insert_in_list(List head, char *string);
/* Rewrite `url` in place into the "www.host/path" form; returns 1 if usable, 0 otherwise. */
int standardization_url(char *url, char *root_domain_name, int max_size);
/* Unlink and free `ptr` from the list starting at `head`. */
void delete_in_list(List head, List ptr);
/* Record `url` in the visited-URL hash table. */
void insert_in_hash(ahash ahash, char *url);
/* Returns 1 if `url` is already in the visited-URL hash table, 0 otherwise. */
int isvisited(ahash hash, char *url);
void viste_one_cycle(FILE* collect_informations, FILE *writeto, List head, ahash hash);// takes exactly 4 parameters
/* Bucket index of `string` in the given table. */
int hash(ahash hash, char *string);
/* Smallest prime >= i, as judged by Isprime(). */
int Nextprime(int i);
/* Returns 1 if i is prime, 0 otherwise. */
int Isprime(int i);
/* Split "host/path" into its host and path parts.
 * The parameter names now follow the definition and its caller: the second
 * argument receives the domain name, the third receives the path. */
void get_domain_name_and_path_from_url(char *url, char *domain_name, char *path);
/* Allocate a hash table with at least `up_limit` buckets. */
ahash initialize_hash(int up_limit);
/* Build the pending-URL list from the lines of `readfrom`. */
List initialize_url_list(FILE *readfrom);
/* Create a TCP socket and connect it to the given dotted-decimal address and port. */
SOCKET build_connect(char *ID, int port);
/* Resolve `domain_name` and store the dotted-decimal address in `IP`. */
void domain_name_to_IP(char *domain_name, char *IP);
/* Print `error_position` as an error message. */
void myexit(char * error_position);
/* socket() wrapper that reports failures. */
int Socket(int family, int type, int protocal);
/* connect() wrapper that reports failures. */
void Connect(SOCKET s, const struct sockaddr * name, int namelen);
/* For every substring of `text` between `before` and the first char of `after`:
 * normalise it as a URL and, if usable, add it to the list and write it to `file`.
 * Returns the number of terminated captures found. */
int collect_new_url(FILE *file, List head, const char* root_domain_name, const char * text, const char *before, const char *after, int max_size);
int main(void)
{
LPWSADATA wsaData;
WSAStartup(MAKEWORD(2, 1), &wsaData);
MYSQL * mysql = mysql_init(0);
mysql_real_connect(mysql, "localhost", "root", "root", "webrobot", 0, 0, 0);
FILE *readfrom = fopen("1.txt", "r");
FILE *collect_informations = fopen("3.txt", "w");
FILE *write_visited_url_to = fopen("2.txt", "w");
ahash hash = initialize_hash(NUMBER_OF_VISITED_URLS);
List head = initialize_url_list(readfrom);
for (int i = 1; i <= XUN_HUAN_CI_SHU; i++)
  viste_one_cycle(mysql, collect_informations, write_visited_url_to, head, hash);//传了5个参数导致错位，但vs不会报错，由此产生奇怪的bug
fclose(collect_informations);
fclose(readfrom);
fclose(write_visited_url_to);
mysql_close(mysql);
WSACleanup();
return 0;
}
/*
 * Resolve `domain_name` with gethostbyname() and store the first address, in
 * dotted-decimal form, in `IP`.
 *
 * domain_name: NUL-terminated host name.
 * IP:          output buffer supplied by the caller; receives the address
 *              text, or an empty string when resolution fails.
 *
 * On failure the error is reported through myexit(). Because myexit() may
 * return to the caller, this function returns right afterwards instead of
 * dereferencing the NULL result as the original did.
 */
void domain_name_to_IP(char *domain_name, char *IP)
{
    struct hostent *target_server = gethostbyname(domain_name);
    if (target_server == NULL || target_server->h_addr_list[0] == NULL)
    {
        IP[0] = '\0';
        myexit("获取DNS服务失败\n");
        return;
    }

    /* Copy the raw address bytes rather than casting the char pointer. */
    struct in_addr addr;
    memcpy(&addr, target_server->h_addr_list[0], sizeof addr);
    strcpy(IP, inet_ntoa(addr));
}
SOCKET build_connect(char * IP, int port)
{
SOCKET s = Socket(AF_INET, SOCK_STREAM, 0);
struct sockaddr_in serveraddr;
memset(serveraddr.sin_zero, 0, sizeof(serveraddr.sin_zero));
serveraddr.sin_port = htons(port);
serveraddr.sin_family = AF_INET;
serveraddr.sin_addr.S_un.S_addr = inet_addr(IP);
Connect(s, (struct sockaddr *)&serveraddr, sizeof(serveraddr));
return s;
}
/*
 * connect() wrapper: on failure prints the Winsock error code and reports the
 * error through myexit().
 *
 * s, name, namelen: forwarded unchanged to connect().
 */
void Connect(SOCKET s, const struct sockaddr * name, int namelen)
{
    if (connect(s, name, namelen) == 0)
        return;

    /* WSAGetLastError must be called; the original passed the function itself
     * to printf, which printed a meaningless number. */
    printf("%d\n", WSAGetLastError());
    myexit("与服务器建立连接失败\n");
}
int Socket(int family, int type, int protocal)
{
SOCKET s = socket(family, type, protocal);
if (s > 0)
  return s;
else
{
  printf("%d", WSAGetLastError);
  myexit("创建套接字失败\n");
}
}
/*
 * Report a fatal error and terminate the program.
 *
 * error_position: message to print (printed verbatim, no newline added).
 *
 * The original only printed the message and returned, although its callers
 * are written on the assumption that it never comes back (for example
 * domain_name_to_IP dereferences the NULL lookup result right after the call).
 */
void myexit(char * error_position)
{
    printf("%s", error_position);
    /* Flush explicitly so the message is visible before the process ends. */
    fflush(stdout);
    exit(EXIT_FAILURE);
}
/*
 * Scan `text` for substrings enclosed between `before` and the first
 * character of `after`, treat each as a URL, normalise it with
 * standardization_url() and, when that succeeds, add it to the pending list
 * and write it to `file` followed by a newline.
 *
 * file:             output stream for accepted URLs.
 * head:             sentinel head of the pending-URL list.
 * root_domain_name: host prepended to relative ("/...") URLs.
 * text:             NUL-terminated text to scan.
 * before:           opening marker, e.g. "href=\"".
 * after:            closing marker; only its first character is compared.
 * max_size:         capacity of one URL including the terminator.
 *
 * Returns the number of terminated captures found (accepted or not).
 *
 * Differences from the original hand-written scanner:
 *  - a capture that does not fit in max_size - 1 characters is dropped
 *    cleanly; previously the write index was left stale and corrupted every
 *    later capture in the same text;
 *  - markers are located with strstr(), so a match that starts one character
 *    after a failed partial match (e.g. "hhref=") is no longer skipped.
 */
int collect_new_url(FILE *file, List head, const char* root_domain_name, const char * text, const char *before, const char *after, int max_size)
{
    if (text == NULL || before == NULL || after == NULL || max_size <= 0)
        return 0;

    size_t before_len = strlen(before);
    if (before_len == 0)
        return 0;

    char *url = (char *)malloc((size_t)max_size);
    if (url == NULL)
        return 0;

    size_t capacity = (size_t)max_size - 1;
    int count = 0;
    const char *cursor = text;

    while ((cursor = strstr(cursor, before)) != NULL)
    {
        cursor += before_len;

        size_t k = 0;
        while (*cursor != '\0' && *cursor != after[0] && k < capacity)
            url[k++] = *cursor++;

        if (*cursor == '\0')
            break;              /* text ended before the closing marker */
        if (*cursor != after[0])
            continue;           /* capture too long: drop it, keep scanning */

        url[k] = '\0';
        /* standardization_url() only reads the root domain name; the cast
         * matches its non-const parameter type. */
        if (standardization_url(url, (char *)root_domain_name, max_size))
        {
            insert_in_list(head, url);
            fputs(url, file);
            fputc('\n', file);
        }
        count++;
        cursor++;               /* step over the closing marker */
    }

    free(url);
    return count;
}
ahash initialize_hash(int max_number)
{
ahash hash = (ahash)malloc(sizeof(struct hashtable));
hash->tablesize = Nextprime(max_number);
hash->url = (List *)malloc(sizeof(List)*hash->tablesize);
for (int i = 0; i < hash->tablesize; i++)
{
  hash->url[i] = (List)malloc(sizeof(struct node));
  hash->url[i]->next = NULL;
}
return hash;
}
/*
 * Returns the first value >= n for which Isprime() reports a prime.
 */
int Nextprime(int n)
{
    int candidate = n;

    while (!Isprime(candidate))
        candidate++;
    return candidate;
}
/*
 * Primality test by trial division.
 *
 * n: value to test.
 *
 * Returns 1 if n is prime, 0 otherwise (including every n < 2).
 *
 * The original tested `n / i == 0`, which is never true for 2 <= i < n, so
 * every number other than 1 was reported as prime; the divisibility test
 * needs the remainder operator.
 */
int Isprime(int n)
{
    if (n < 2)
        return 0;

    /* Divisors only need checking up to sqrt(n); `i <= n / i` avoids overflow. */
    for (int i = 2; i <= n / i; i++)
        if (n % i == 0)
            return 0;
    return 1;
}
/*
 * Build the pending-URL list from a text file containing one URL per line.
 *
 * readfrom: stream to read; may be NULL, in which case the list is empty.
 *
 * Returns the sentinel head of the list (NULL only if the head itself cannot
 * be allocated). Each line is pushed onto the front, so the list ends up in
 * reverse file order, as before.
 *
 * fgets() keeps the line terminator; the original stored it as part of the
 * URL, so it ended up in the request path and in every comparison. The
 * terminator is stripped here and blank lines are skipped.
 * NOTE(review): a line longer than SIZE_OF_URL - 1 characters is still split
 * by fgets() into several entries.
 */
List initialize_url_list(FILE *readfrom)
{
    List head = (List)malloc(sizeof(struct node));
    if (head == NULL)
        return NULL;
    head->next = NULL;

    if (readfrom == NULL)
        return head;

    char url[SIZE_OF_URL];
    while (fgets(url, sizeof(url), readfrom))
    {
        url[strcspn(url, "\r\n")] = '\0';
        if (url[0] == '\0')
            continue;

        List ptr = (List)malloc(sizeof(struct node));
        if (ptr == NULL)
            break;              /* out of memory: keep what was read so far */
        strcpy(ptr->url, url);
        ptr->next = head->next;
        head->next = ptr;
    }
    return head;
}
/*
 * Split a scheme-less URL of the form "host/path" at its first '/'.
 *
 * url:         NUL-terminated input, e.g. "www.example.com/a/b.html".
 * domain_name: output buffer; receives everything before the first '/'.
 * path:        output buffer; receives the rest, starting with the '/'.
 *              When the URL contains no '/', the path is set to "/".
 *
 * Both output buffers are supplied by the caller and must each be able to
 * hold strlen(url) + 1 bytes; no sizes are passed in.
 *
 * The original looped until it found a '/', reading (and writing) past the
 * end of the string when the URL had none.
 *
 * The URL and both parts are echoed to stdout, as before.
 */
void get_domain_name_and_path_from_url(char *url, char *domain_name, char *path)
{
    puts(url);

    /* Length of the host part: stops at the first '/' or at the terminator. */
    size_t host_len = strcspn(url, "/");
    memcpy(domain_name, url, host_len);
    domain_name[host_len] = '\0';

    if (url[host_len] == '/')
        strcpy(path, url + host_len);
    else
        strcpy(path, "/");

    puts(domain_name);
    puts(path);
}
/*
 * Look `url` up in the visited-URL table.
 *
 * Walks the chain of the bucket selected by hash(), skipping the sentinel.
 * Returns 1 when an entry with identical text exists, 0 otherwise.
 */
int isvisited(ahash ahash, char *url)
{
    List entry;

    for (entry = ahash->url[hash(ahash, url)]->next; entry != NULL; entry = entry->next)
    {
        if (!strcmp(entry->url, url))
            return 1;
    }
    return 0;
}
/*
 * Add a copy of `url` to the visited-URL table.
 *
 * The new entry is linked in directly behind the sentinel of the bucket
 * selected by hash(); no duplicate check is performed.
 */
void insert_in_hash(ahash ahash, char *url)
{
    List bucket_head = ahash->url[hash(ahash, url)];
    List entry = (List)malloc(sizeof(struct node));

    strcpy(entry->url, url);
    entry->next = bucket_head->next;
    bucket_head->next = entry;
}
/*
 * Unlink the node `ptr` from the list that starts at the sentinel `head`
 * and free it.
 *
 * head: sentinel head of the list.
 * ptr:  the node to remove; nothing happens if it is NULL or not in the list.
 *
 * The predecessor is found by pointer identity. The original compared URL
 * text instead, so when an earlier node carried the same URL (the crawler
 * inserts newly found links at the front, duplicates included) it spliced
 * from that earlier node to ptr->next, silently dropping and leaking every
 * node in between. It also dereferenced NULL when no node matched.
 */
void delete_in_list(List head, List ptr)
{
    if (head == NULL || ptr == NULL)
        return;

    List prev = head;
    while (prev->next != NULL && prev->next != ptr)
        prev = prev->next;

    if (prev->next == NULL)
        return;                 /* ptr is not a member of this list */

    prev->next = ptr->next;
    free(ptr);
}
/*
 * Bucket index for `string` in the given table.
 *
 * Folds the characters left to right (shift the accumulator by five bits,
 * then add the character) in unsigned arithmetic and reduces the result
 * modulo the table size.
 */
int hash(ahash hash, char *string)
{
    unsigned int acc = 0;
    char *p;

    for (p = string; *p; p++)
        acc = (acc << 5) + *p;
    return acc%hash->tablesize;
}
/*
 * Push a copy of `string` onto the front of the list, i.e. directly behind
 * the sentinel `head`.
 */
void insert_in_list(List head, char *string)
{
    List fresh = (List)malloc(sizeof(struct node));

    strcpy(fresh->url, string);
    fresh->next = head->next;
    head->next = fresh;
}
/*
 * Normalise a link into the scheme-less "www.host/path" form, in place.
 *
 * url:              NUL-terminated link text in a buffer of max_size bytes;
 *                   rewritten on success, left untouched on failure.
 * root_domain_name: host to prepend when the link is a bare path ("/...").
 * max_size:         capacity of the url buffer, terminator included.
 *
 * Returns 1 when the link is usable (already starts with "www.", is a path
 * that was prefixed with the root domain, or was "http://www." or
 * "https://www." with the scheme stripped), 0 otherwise.
 *
 * Fixes relative to the original:
 *  - the "already canonical" test looked at url[4] instead of url[3] for the
 *    dot, so "www.example.com/..." was rejected;
 *  - the scratch buffer leaked on the final `return 0`; the rewrite needs no
 *    scratch buffer at all;
 *  - prefixing the root domain could write past both max_size buffers; a
 *    result that would not fit is now rejected.
 */
int standardization_url(char *url, char *root_domain_name, int max_size)
{
    if (url == NULL || root_domain_name == NULL || max_size <= 0)
        return 0;

    size_t url_len = strlen(url);
    if (url_len <= 11 || url_len >= (size_t)max_size)// too long or too short: reject
        return 0;

    if (strncmp(url, "www.", 4) == 0)// already canonical: accept as is
        return 1;

    if (url[0] == '/')// a bare path: put the root domain name in front
    {
        size_t root_len = strlen(root_domain_name);
        if (root_len + url_len >= (size_t)max_size)
            return 0;
        /* Shift the path right (terminator included), then drop the host in. */
        memmove(url + root_len, url, url_len + 1);
        memcpy(url, root_domain_name, root_len);
        return 1;
    }

    /* Only http://www.xxx and https://www.xxx are recognised; strip the scheme. */
    size_t skip;
    if (strncmp(url, "http://www.", 11) == 0)
        skip = strlen("http://");
    else if (strncmp(url, "https://www.", 12) == 0)
        skip = strlen("https://");
    else
        return 0;

    memmove(url, url + skip, url_len - skip + 1);
    return 1;
}
/*
 * Scan `text` for substrings enclosed between `before` and the first
 * character of `after`, and write each one to `file` on its own line.
 *
 * file:     output stream.
 * text:     NUL-terminated text to scan.
 * before:   opening marker, e.g. "data-author-name=\"".
 * after:    closing marker; only its first character is compared.
 * max_size: capacity of one captured value including the terminator.
 *
 * A capture whose closing marker is missing (the text ends first) is
 * discarded, as before.
 *
 * Differences from the original hand-written scanner:
 *  - a capture that does not fit in max_size - 1 characters is dropped
 *    cleanly; previously the write index was left stale and corrupted every
 *    later capture in the same text;
 *  - markers are located with strstr(), so a match that starts one character
 *    after a failed partial match is no longer skipped;
 *  - a failed write is reported with its own message instead of printing the
 *    same text as a successful write.
 */
void collect_information(FILE *file, char *text, char *before, char *after, int max_size)
{
    if (file == NULL || text == NULL || before == NULL || after == NULL || max_size <= 0)
        return;

    size_t before_len = strlen(before);
    if (before_len == 0)
        return;

    char *target_string = (char *)malloc((size_t)max_size);
    if (target_string == NULL)
        return;

    size_t capacity = (size_t)max_size - 1;
    const char *cursor = text;

    while ((cursor = strstr(cursor, before)) != NULL)
    {
        cursor += before_len;

        size_t k = 0;
        while (*cursor != '\0' && *cursor != after[0] && k < capacity)
            target_string[k++] = *cursor++;

        if (*cursor == '\0')
            break;              /* text ended before the closing marker */
        if (*cursor != after[0])
            continue;           /* capture too long: drop it, keep scanning */

        target_string[k] = '\0';
        if (fputs(target_string, file) == EOF)
            fprintf(stderr, "写入文件失败\n");
        else
            printf("\n\n\n\n\n\n\n写入文件\n\n\n\n\n\n\n\n");
        fputc('\n', file);
        cursor++;               /* step over the closing marker */
    }

    free(target_string);
}
/*
 * Count one more occurrence of `name` in the `robot` table: if a row whose
 * first column equals `name` exists, its `value` is incremented; otherwise a
 * new row (name, 1) is inserted.
 *
 * mysql: connected handle; nothing happens if it is NULL.
 * name:  NUL-terminated value to count; nothing happens if it is NULL.
 *
 * Fixes relative to the original:
 *  - `name` comes from scraped page content and was pasted into the SQL text
 *    unescaped (SQL injection, and a possible overflow of the fixed 256-byte
 *    statement buffer); it is now escaped with mysql_real_escape_string()
 *    into a buffer sized for the worst case;
 *  - a failed query / NULL result set and a NULL first column are handled
 *    instead of being dereferenced.
 * NOTE(review): the existence check still reads the whole table, as before;
 * a `where name=...` lookup would be cheaper if the first column is `name`.
 */
void add_one_in_name(MYSQL *mysql, char *name)
{
    static const char update_head[] = "update robot set value=value+1 where name='";
    static const char update_tail[] = "'";
    static const char insert_head[] = "insert into robot values('";
    static const char insert_tail[] = "',1)";

    if (mysql == NULL || name == NULL)
        return;

    if (mysql_query(mysql, "select * from robot") != 0)
        return;
    MYSQL_RES *result = mysql_store_result(mysql);
    if (result == NULL)
        return;

    int found = 0;
    MYSQL_ROW row;
    while (!found && (row = mysql_fetch_row(result)) != NULL)
    {
        if (row[0] != NULL && strcmp(row[0], name) == 0)
            found = 1;
    }
    mysql_free_result(result);

    const char *head = found ? update_head : insert_head;
    const char *tail = found ? update_tail : insert_tail;
    size_t head_len = strlen(head);
    size_t tail_len = strlen(tail);
    size_t name_len = strlen(name);

    /* Escaping can at most double the length and appends a terminator. */
    char *sql = (char *)malloc(head_len + 2 * name_len + 1 + tail_len);
    if (sql == NULL)
        return;

    memcpy(sql, head, head_len);
    unsigned long escaped_len = mysql_real_escape_string(mysql, sql + head_len, name, (unsigned long)name_len);
    if (escaped_len == (unsigned long)-1)
    {
        free(sql);
        return;
    }
    memcpy(sql + head_len + escaped_len, tail, tail_len + 1);

    if (mysql_query(mysql, sql) != 0)
        printf("%s\n", mysql_error(mysql));
    free(sql);
}

void viste_one_cycle(FILE *collect_informations, FILE *write_visited_url_to, List head, ahash hash)
{
int port;
char sendbuf[SIZE_OF_SENDBUF], receivebuf[SIZE_OF_RECEIVEBUF] = { 0 }, target_page[SOURCE];
char IP[SIZE_OF_IP];
char domain_name[SIZE_OF_DOMAIN_NAME];
char path[SIZE_OF_PATH];
List ptr = head->next;
while (ptr)
{
  while (isvisited(hash, ptr->url))
  {
   if (!ptr->next)
   {
    delete_in_list(head, ptr);
    ptr = NULL;
    break;
   }
   List next_ptr = ptr->next;
   delete_in_list(head, ptr);
   ptr = next_ptr;
  }
  if (!ptr)
   break;
  get_domain_name_and_path_from_url(ptr->url, domain_name, path);
  domain_name_to_IP(domain_name, IP);
  port = 80;
  SOCKET client = build_connect(IP, port);
  sprintf(sendbuf, "GET %s HTTP 1.1\r\nHost: %s\r\nAccept: *\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586\r\nAccept-Language: zh-CN\r\nAccept-Charset: us-ascii\r\n\r\n", path, domain_name);
  send(client, sendbuf, strlen(sendbuf) + 1, 0);
  if (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0)>0)
  {
   puts(receivebuf);
   collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
   collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);
   while (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0) > 0)
   {
    puts(receivebuf);
    collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
    collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);

   }
  }
  else
  {
   closesocket(client);
   SOCKET client = build_connect(IP, port);
   send(client, sendbuf, strlen(sendbuf) + 1, 0);
   while (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0) > 0)
   {
    puts(receivebuf);
    collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
    collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);
   }
  }
  closesocket(client);
  insert_in_hash(hash, ptr->url);
  List next_ptr = ptr->next;
  delete_in_list(head, ptr);
  ptr = next_ptr;
}
}

0 0