60.大数据创建索引,并实现大文件的二分查找,迁移实现分层

Posted 喵小喵~

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了《60.大数据创建索引,并实现大文件的二分查找,迁移实现分层》相关的知识,希望对你有一定的参考价值。

  • index.h
     /*
      * index.h - configuration, shared global state and helper prototypes used
      * by the sort / index / binary-search modules.
      */
     #ifndef INDEX_H
     #define INDEX_H

     #ifndef _CRT_SECURE_NO_WARNINGS
     #define _CRT_SECURE_NO_WARNINGS /* silence MSVC warnings for fopen/strcpy/sprintf */
     #endif
     #include <stdio.h>
     #include <stdlib.h>
     #include <string.h> /* strlen, strcpy and strcmp are used by the .c files */

     #define N 10000000 /* number of records (lines) every stage loops over */

     /* Table of byte offsets into the sorted data file, one per record. */
     struct index
     {
         int *pindex; /* array of offsets */
         int length;  /* number of entries in pindex */
     };

     /*
      * The globals are only declared here and defined once in index.c.
      * Defining them in the header would create one definition per translation
      * unit that includes it.
      */
     extern char **g_pp;            /* array of pointers to the loaded lines */
     extern char filepath[256];     /* path of the raw input file */
     extern char sortpath[256];     /* path of the sorted output file */
     extern char indexpath[256];    /* path of the binary offset index file */
     extern struct index allindex;  /* the offset index */

     /* Line count of "file.txt", or -1 if it cannot be opened. */
     int getN(void);
     /* Cut str off at its first '-'. */
     void eatg(char *str);
     /* Cut str off at its first '\r' or '\n'. */
     void eatN(char *str);

     #endif /* INDEX_H */

     

  • index.c
     1 #include"index.h"
     2 
     3 char **g_pp = NULL;//保存指针数组
     4 char filepath[256] = { 0 };
     5 char sortpath[256] = { 0 };
     6 char indexpath[256] = { 0 };
     7 struct index allindex = { 0 };//索引
     8 
     /*
      * Count the lines of "file.txt".
      *
      * The file is read with fgets in chunks of at most 49 characters, the same
      * record size the loader uses, and every successful read counts as one line.
      * The loop is driven by the fgets result rather than feof(), so the failed
      * read at end of file is not counted as an extra line.
      *
      * Returns the number of lines, or -1 if the file cannot be opened.
      */
     int getN(void)
     {
         FILE *pf = fopen("file.txt", "r");
         if (pf == NULL)
         {
             return -1;
         }

         int count = 0;
         char str[50];
         while (fgets(str, sizeof str, pf) != NULL)
         {
             count++;
         }

         fclose(pf);
         return count;
     }
    /*
     * Cut the string off at its first '-' by overwriting that character with
     * the terminator, so only the text before the dash remains visible.
     * A NULL pointer or a string without '-' is left untouched.
     */
    void eatg(char *str)
    {
        if (str == NULL)
        {
            return;
        }

        for (; *str != '\0'; str++)
        {
            if (*str == '-')
            {
                *str = '\0';
                return;
            }
        }
    }
    /*
     * Strip the line ending: cut the string off at its first '\r' or '\n'.
     * A NULL pointer or a string without a line break is left untouched.
     */
    void eatN(char *str)
    {
        if (str == NULL)
        {
            return;
        }

        for (; *str != '\0'; str++)
        {
            if (*str == '\r' || *str == '\n')
            {
                *str = '\0';
                return;
            }
        }
    }

     

  • createsort.h
    1 #include "index.h"
    2 
    3 void initmem();
    4 int com(void *p1, void*p2);
    5 void sort();
    6 void show();
    7 void writetofile();

     

  • createsort.c
     1 #include "createsort.h"
     2 void initmem()
     3 {
     4     g_pp = calloc(N, sizeof(char*));//分配指针数组
     5     FILE *pf = fopen(filepath, "r");
     6     if (pf == NULL)
     7     {
     8         return -1;
     9     }
    10     else
    11     {
    12         for (int i = 0; i < N; i++)
    13         {
    14             char str[50] = { 0 };
    15             fgets(str, 50, pf);//读取
    16             g_pp[i] = calloc(strlen(str) + 1, sizeof(char));//分配
    17             if (g_pp[i]!=NULL)
    18             {
    19                 //sprintf(g_pp[i], str);//打印进去
    20                 strcpy(g_pp[i], str);
    21                 eatN(g_pp[i]);
    22             }
    23             
    24             //printf("%s", g_pp[i]);//显示测试
    25 
    26 
    27         }
    28 
    29 
    30         fclose(pf);
    31 
    32 
    33     }
    34 
    35 
    36 
    37 
    38 
    39 
    40 }
    41 
    /*
     * Comparator for the line table: p1 and p2 each point to a `char *` element.
     *
     * Returns a negative, zero or positive value as the first string sorts
     * before, equal to or after the second (strcmp order). A NULL element,
     * which an unfilled table slot can be, sorts before every real string and
     * equal to another NULL instead of being passed to strcmp.
     */
    int com(void *p1, void *p2)
    {
        const char *s1 = *(char **)p1;
        const char *s2 = *(char **)p2;

        if (s1 == NULL || s2 == NULL)
        {
            return (s1 != NULL) - (s2 != NULL);
        }

        return strcmp(s1, s2);
    }
    50 
    51 void sort()
    52 {
    53     qsort(g_pp, N, sizeof(char*), com);
    54 
    55 
    56 }
    57 void show()
    58 {
    59     printf("\n此时状态\n");
    60     for (int i = 0; i < N; i++)
    61     {
    62         printf("\n%s", g_pp[i]);
    63     }
    64 }
    65 void writetofile()
    66 {
    67     FILE *pf = fopen(sortpath, "w");
    68     for (int i = 0; i < N; i++)
    69     {
    70         char temp[100] = { 0 };
    71     //    printf("\n%s", g_pp[i]);
    72         sprintf(temp, "%s\n", g_pp[i]);
    73     //    printf("\n%s", temp);
    74         fputs(temp, pf);
    75     }
    76 
    77     fclose(pf);
    78 }

     

  • createindex.h
    1 #include "index.h"
    2 void init();
    3 void qucik();

     

  • createindex.c
     1 #include "createindex.h"
     2 
     3 
     4 void init()
     5 {
     6     printf("\n索引数组开始分配");
     7     allindex.length = N;
     8     allindex.pindex = calloc(N, sizeof(int));//分配内存
     9     printf("\n索引数组完成分配");
    10 
    11     printf("\n开始读取");
    12     FILE *pf = fopen(sortpath, "rb");//\r\n->\n
    13     if (pf == NULL)
    14     {
    15         return -1;
    16     }
    17     else
    18     {
    19         int alllength = 0;
    20         for (int i = 0; i < N; i++)
    21         {
    22             char str[50] = { 0 };
    23             fgets(str, 50, pf);
    24             allindex.pindex[i] = alllength;//错位从0开始
    25 
    26             int length = strlen(str);
    27             alllength += length;
    28 
    29         }
    30 
    31         fclose(pf);
    32     }
    33     printf("\n结束读取");
    34 
    35     printf("\n开始写入");
    36     FILE *pfw = fopen(indexpath, "wb");//写入索引
    37     fwrite(allindex.pindex, sizeof(int), allindex.length, pfw);
    38     fclose(pfw);//关闭
    39     printf("\n结束写入");
    40 
    41 
    42     free(allindex.pindex);
    43 
    44 }
    45 void qucik()
    46 {
    47     printf("\n索引数组开始分配");
    48     allindex.length = N;
    49     allindex.pindex = calloc(N, sizeof(int));//分配内存
    50     printf("\n索引数组完成分配");
    51 
    52     printf("\n开始读取");
    53     FILE *pfw = fopen("index.txt", "rb");//写入索引
    54     fread(allindex.pindex, sizeof(int), allindex.length, pfw);
    55     fclose(pfw);//关闭
    56     printf("\n结束读取");
    57 }

     

  • binsearch.h
    1 #include "index.h"
    2 void binsearch(char *searchstr);

     

  • binsearch.c
     1 #include "binsearch.h"
     2 
     3 void binsearch(char *searchstr)
     4 {
     5     int tou = 0;
     6     int wei = N - 1;
     7     int flag = 0;
     8     while (tou <= wei)
     9     {
    10         int zhong = (tou + wei) / 2;
    11         char zhongstr[256] = { 0 };
    12         {
    13             FILE *pf1 = fopen(indexpath, "rb");
    14             FILE *pf2 = fopen(sortpath, "rb");
    15 
    16 
    17             int indexnum = 0;
    18             fseek(pf1, zhong*sizeof(int), SEEK_SET);
    19             fread(&indexnum, sizeof(int), 1, pf1);//读索引zhong到indexnum
    20 
    21             fseek(pf2, indexnum, SEEK_SET);
    22             fgets(zhongstr, 128, pf2);//读取
    23 
    24             fclose(pf1);
    25             fclose(pf2);
    26         }
    27         eatN(zhongstr);
    28         char pnewzhongstr[256] = { 0 };
    29         sprintf(pnewzhongstr, zhongstr);
    30         eatg(pnewzhongstr);//遇到-终止
    31         int res = strcmp(pnewzhongstr, searchstr);//1 0  -1
    32 
    33 
    34         if (res == 0)
    35         {
    36             flag = 1;
    37             printf("%s", zhongstr);
    38             break;
    39         }
    40         else if (res == 1)
    41         {
    42             wei = zhong - 1;
    43         }
    44         else
    45         {
    46             tou = zhong + 1;
    47         }
    48 
    49 
    50     }
    51 
    52 
    53     if (flag)
    54     {
    55         printf("\nfind");
    56     }
    57     else
    58     {
    59         printf("\n not find");
    60     }
    61 
    62 
    63 }

     

  • main.c
     1 #include "binsearch.h"
     2 void initall()
     3 {
     4     strcpy(filepath, "1E~001OK.txt");
     5     strcpy(sortpath, "1E~001sort.txt");
     6     strcpy(indexpath, "1E~001index.txt");
     7 
     8 }
     9 
    /*
     * Forward declarations for the stages called below. This file includes
     * only binsearch.h, which does not declare them, and calling an undeclared
     * function is not valid C99 or later.
     */
    void initmem();
    void sort();
    void writetofile();
    void init();

    /*
     * Run the whole pipeline: set the paths, load the raw file, sort it, write
     * the sorted file, build the on-disk index, then answer one query per
     * whitespace-delimited word read from stdin until input ends.
     */
    int main(void)
    {
        initall();
        /* load the raw records into memory */
        initmem();
        /* sort them */
        sort();
        /* write the sorted records to disk */
        writetofile();

        /* build the offset index */
        init();

        /* binary search, one query per input word */
        for (;;)
        {
            char str[256] = { 0 };
            /* %255s bounds the read to the buffer; stop at EOF or input error. */
            if (scanf("%255s", str) != 1)
            {
                break;
            }
            binsearch(str);
        }

        return 0;
    }

     

以上是关于《60.大数据创建索引,并实现大文件的二分查找,迁移实现分层》的主要内容,如果未能解决你的问题,请参考以下文章:

大数据面试题十四 · 数据结构与算法 · 二分查找

支持60+数据传输链路,华为云DRS链路商用大盘点

说一下二分查找

14天算法入门-第1天-二分查找

python学习(二分法)

1.二分查找