URL 解析极简版

Posted 车斗

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了URL 解析极简版相关的知识,希望对你有一定的参考价值。

URL 解析极简版

网上找了很多 URL 解析的代码,要么太复杂,要么有 BUG。于是我自己写了一个解析 URL 的实现:从头到尾不分配内存,只有一个头文件 urlcodec.h。

/**
* @file urlcodec.h
* @brief url encode and decode
*    https://www.bejson.com/enc/urlencode/
* @author zhang
* @version
* @date 2021
* @note
* @since 2015
*/
#ifndef URL_CODEC_INCLUDED
#define URL_CODEC_INCLUDED

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
// #include <assert.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Sample url_query_parse() callback: prints one query parameter to stdout.
 *
 * @param paramid   ordinal of the parameter as passed by the caller.
 * @param cbarg     opaque user argument; not used by this callback.
 * @param namelen   length of name in bytes (name need not be NUL-terminated).
 * @param name      start of the parameter name.
 * @param valuelen  length of value in bytes (value need not be NUL-terminated).
 * @param value     start of the parameter value.
 * @return always 1 ("continue"); a callback returns 0 to stop the iteration.
 */
static int url_param_print(int paramid, void *cbarg, int namelen, const char *name, int valuelen, const char *value)
{
    (void) cbarg;

    /* "\n" (a real newline), not a literal backslash followed by 'n'. */
    printf("  :param%d {%.*s=%.*s}\n", paramid, namelen, name, valuelen, value);

    /* 0: break,  1: continue */
    return 1;
}


/**
 *  Usage example:
 *
 *  const char testUrl[] = "https://cn.bing.com/search?q=news&form=QBLH&sp=-1&pq=news&sc=8-4&qs=n&sk=&cvid=AFDF6C399DF44C41977CB0ED4AE87B33";
 *
 *  char encodedUrl[256];
 *  char decodedUrl[256];
 *
 *  size_t outsize = url_encode(testUrl, encodedUrl, sizeof(encodedUrl));
 *  if (outsize < sizeof(encodedUrl)) {
 *      printf("success url_encode(): {%s}\n", encodedUrl);
 *
 *      outsize = url_decode(encodedUrl, decodedUrl, sizeof(decodedUrl));
 *      if (outsize < sizeof(decodedUrl)) {
 *          printf("success url_decode(): {%s}\n", decodedUrl);
 *
 *          if (memcmp(testUrl, decodedUrl, outsize)) {
 *              printf("fatal error: urlcodec.h has bugs!\n");
 *          }
 *      } else {
 *          printf("failed url_decode: required outbuf at least: %d bytes\n", (int)outsize);
 *      }
 *  } else {
 *      printf("failed url_encode: required outbuf at least: %d bytes\n", (int)outsize);
 *  }
 */

/**
 * Percent-encode a NUL-terminated string (form style: ' ' becomes '+').
 *
 * ASCII letters and digits and the characters '-', '_', '.', '~' are copied
 * unchanged, a space becomes '+', and every other byte becomes "%XX" with
 * upper-case hexadecimal digits.
 *
 * @param url      NUL-terminated input; NULL is treated as an empty string.
 * @param outbuf   destination buffer; may be NULL only when bufsize is 0.
 * @param bufsize  capacity of outbuf in bytes, terminator included.
 * @return number of bytes needed for the complete result INCLUDING the
 *         terminating NUL. The output is complete iff the return value is
 *         <= bufsize. When it is larger, outbuf (if bufsize > 0) holds a
 *         NUL-terminated prefix made of whole tokens only, so a "%XX"
 *         sequence is never split and no later byte is written after a
 *         token that did not fit.
 */
static size_t url_encode(const char *url, char *outbuf, size_t bufsize)
{
    static const char hex_char_table[] = "0123456789ABCDEF";

    /* Work on unsigned bytes: <ctype.h> functions and shifts are only
     * well-defined for non-negative values, and plain char may be signed. */
    const unsigned char *p = (const unsigned char *) (url ? url : "");
    size_t needed = 0;      /* encoded length, terminator excluded */
    size_t used = 0;        /* bytes actually stored in outbuf */
    int truncated = 0;      /* set once a token did not fit */

    for (; *p; p++) {
        const unsigned char c = *p;
        char token[3];
        size_t tokenlen;

        /* c < 0x80 keeps the result independent of the current locale. */
        if ((c < 0x80 && isalnum(c)) || c == '-' || c == '_' || c == '.' || c == '~') {
            token[0] = (char) c;
            tokenlen = 1;
        } else if (c == ' ') {
            token[0] = '+';
            tokenlen = 1;
        } else {
            token[0] = '%';
            token[1] = hex_char_table[c >> 4];
            token[2] = hex_char_table[c & 15];
            tokenlen = 3;
        }

        needed += tokenlen;

        /* Always keep one byte in reserve for the terminator. */
        if (!truncated && bufsize > 0 && tokenlen < bufsize - used) {
            memcpy(outbuf + used, token, tokenlen);
            used += tokenlen;
        } else {
            truncated = 1;
        }
    }

    if (bufsize > 0) {
        outbuf[used] = '\0';
    }

    return needed + 1;
}


/* Value (0..15) of one hexadecimal digit, or -1 when ch is not a hex digit. */
static int url_decode_hex_digit(unsigned char ch)
{
    if (ch >= '0' && ch <= '9') {
        return ch - '0';
    }
    if (ch >= 'a' && ch <= 'f') {
        return ch - 'a' + 10;
    }
    if (ch >= 'A' && ch <= 'F') {
        return ch - 'A' + 10;
    }
    return -1;
}

/**
 * Decode a percent-encoded, NUL-terminated string (inverse of url_encode).
 *
 * "%XX" with two hexadecimal digits (either case) becomes the byte 0xXX and
 * '+' becomes a space. A '%' that is NOT followed by two hexadecimal digits
 * is malformed input and is copied through literally instead of being
 * dropped or decoded into an arbitrary byte.
 *
 * Note that "%00" decodes to a NUL byte inside the output; it is counted in
 * the return value, but C string functions will stop at it.
 *
 * @param encurl   NUL-terminated input; NULL is treated as an empty string.
 * @param outbuf   destination buffer; may be NULL only when bufsize is 0.
 * @param bufsize  capacity of outbuf in bytes, terminator included.
 * @return number of bytes needed for the complete result INCLUDING the
 *         terminating NUL. The output is complete iff the return value is
 *         <= bufsize; otherwise outbuf (if bufsize > 0) holds a
 *         NUL-terminated prefix of the decoded data.
 */
static size_t url_decode(const char *encurl, char *outbuf, size_t bufsize)
{
    const unsigned char *p = (const unsigned char *) (encurl ? encurl : "");
    size_t needed = 0;      /* decoded length, terminator excluded */
    size_t used = 0;        /* bytes actually stored in outbuf */

    while (*p) {
        unsigned char c = *p++;

        if (c == '%') {
            /* p[1] is only read when p[0] is a hex digit, i.e. not the NUL. */
            const int hi = url_decode_hex_digit(p[0]);
            const int lo = (hi < 0) ? -1 : url_decode_hex_digit(p[1]);

            if (lo >= 0) {
                c = (unsigned char) ((hi << 4) | lo);
                p += 2;
            }
        } else if (c == '+') {
            c = ' ';
        }

        needed++;

        /* Always keep one byte in reserve for the terminator. */
        if (bufsize > 0 && used + 1 < bufsize) {
            outbuf[used++] = (char) c;
        }
    }

    if (bufsize > 0) {
        outbuf[used] = '\0';
    }

    return needed + 1;
}


/**
 * protocol://user:password@host:port/directory/file.extension
 * https://john:hello@www.aspxfans.com:8080/world/news/index.asp?boardID=5&ID=24618&page=1#name
 */
typedef struct {
    /*
     * Result of url_parse(). Every pointer refers INTO the string that was
     * passed to url_parse(): nothing is copied and no component is
     * NUL-terminated on its own, so always use a pointer together with its
     * length (e.g. printf("%.*s", len, ptr)). The pointers stay valid only
     * as long as that string does, and must not be written through.
     * A NULL pointer means the component's delimiter was not present.
     */
    char *protocol;    /* [mandatory] https */
    int protocollen;   /* 5 */

    char *user;        /* <optional> john */
    int userlen;       /* 4 */

    char *password;    /* <optional> hello */
    int passwordlen;

    char *host;        /* [mandatory] www.aspxfans.com */
    int hostlen;

    char *port;        /* <optional> 8080 */
    int portlen;

    char *path;        /* <optional> /world/news/  */
    int pathlen;

    char *file;        /* <optional> index.asp */
    int filelen;

    char *query;       /* <optional> boardID=5&ID=24618&page=1 */
    int querylen;

    char *fragment;    /* <optional> name */
    int fragmentlen;
} url_parts_t;


/**
 * Split a URL of the form
 *
 *   protocol://[user[:password]@]host[:port][/directory/file][?query][#fragment]
 *
 * into its components without allocating or modifying anything.
 *
 * URL Specification: https://blog.csdn.net/fjb2080/article/details/80552213
 *
 * Rules applied:
 *  - protocol is everything before the first ':', which must start "://".
 *  - The authority ends at the first '/', '?' or '#'. User info, host and
 *    port are looked for inside the authority only, so '@', ':' or '/'
 *    characters in the query or fragment never disturb them.
 *  - user always points at the start of the authority; userlen is 0 when
 *    there is no '@'. The last '@' of the authority ends the user info and
 *    the first ':' before it separates user from password.
 *  - A host written as a bracketed literal ("[::1]") keeps its brackets in
 *    host/hostlen; the colons inside the brackets are not port separators.
 *  - path is the directory part from the first '/' up to and including the
 *    last '/' before '?' or '#' ("/index.asp" has path "/", pathlen 1);
 *    file is whatever follows that slash (possibly empty, never NULL when
 *    path is set).
 *  - query and fragment exclude their leading '?' / '#'.
 *
 * Nothing is percent-decoded; use url_decode() on the pieces if needed.
 *
 * @param url    NUL-terminated URL. Must outlive any use of *parts.
 * @param parts  receives the result; it is zeroed first.
 * @return 0 on success; -1 if url or parts is NULL, if there is no
 *         non-empty protocol followed by "://", or if the host is empty.
 *         On -1 the contents of *parts are only partially filled in.
 */
static int url_parse(const char *url, url_parts_t *parts)
{
    const char *authority;
    const char *authority_end;
    const char *host;
    const char *scan;
    const char *at;
    const char *colon;
    const char *rest;
    size_t schemelen;

    if (!parts) {
        return -1;
    }
    memset(parts, 0, sizeof(url_parts_t));

    if (!url) {
        return -1;
    }
    parts->protocol = (char *) url;

    /* The first delimiter of any kind must be the ':' of "://". */
    schemelen = strcspn(url, ":/?#");
    if (schemelen == 0 || strncmp(url + schemelen, "://", 3) != 0) {
        return -1;
    }
    parts->protocollen = (int) schemelen;

    authority = url + schemelen + 3;
    authority_end = authority + strcspn(authority, "/?#");

    /* [user[:password]@] */
    parts->user = (char *) authority;
    host = authority;

    at = NULL;
    for (scan = authority; scan < authority_end; scan++) {
        if (*scan == '@') {
            at = scan;
        }
    }

    if (at) {
        colon = (const char *) memchr(authority, ':', (size_t) (at - authority));
        if (colon) {
            parts->userlen = (int) (colon - authority);
            parts->password = (char *) (colon + 1);
            parts->passwordlen = (int) (at - colon - 1);
        } else {
            parts->userlen = (int) (at - authority);
        }
        host = at + 1;
    }

    /* host[:port] */
    parts->host = (char *) host;

    scan = host;
    if (host < authority_end && *host == '[') {
        const char *close = (const char *) memchr(host, ']', (size_t) (authority_end - host));
        if (close) {
            scan = close + 1;   /* skip the colons of an IPv6 literal */
        }
    }

    colon = (const char *) memchr(scan, ':', (size_t) (authority_end - scan));
    if (colon) {
        parts->hostlen = (int) (colon - host);
        parts->port = (char *) (colon + 1);
        parts->portlen = (int) (authority_end - colon - 1);
    } else {
        parts->hostlen = (int) (authority_end - host);
    }

    if (!parts->hostlen) {
        return -1;
    }

    /* [/directory/file] */
    rest = authority_end;
    if (*rest == '/') {
        const char *path_end = rest + strcspn(rest, "?#");
        const char *file = path_end;

        /* rest[0] is '/', so this walk always stops inside the path. */
        while (file[-1] != '/') {
            file--;
        }

        parts->path = (char *) rest;
        parts->pathlen = (int) (file - rest);
        parts->file = (char *) file;
        parts->filelen = (int) (path_end - file);

        rest = path_end;
    }

    /* [?query] */
    if (*rest == '?') {
        const size_t querylen = strcspn(rest + 1, "#");

        parts->query = (char *) (rest + 1);
        parts->querylen = (int) querylen;

        rest += 1 + querylen;
    }

    /* [#fragment] */
    if (*rest == '#') {
        parts->fragment = (char *) (rest + 1);
        parts->fragmentlen = (int) strlen(rest + 1);
    }

    return 0;
}


/**
 *  url_parts_t parts;
 *  if (url_parse("https://john:hello@www.aspxfans.com:8080/world/news/index.asp?boardID=5&ID=24618&page=1#name", &parts) == 0) {
 *      // parse success
 *      if (parts.querylen) {
 *          // "boardID=5&ID=24618&page=1"
 *          url_query_parse(parts.query, parts.querylen, url_param_print, NULL);
 *      }
 *  }
 */
/**
 * Walk the "name=value" pairs of a query string and report each to a callback.
 *
 * Pairs are separated by '&'; inside a pair the FIRST '=' separates the name
 * from the value, so a value may itself contain '='. Only pairs with a
 * non-empty name AND a non-empty value are counted and reported: "flag",
 * "=x" and "sk=" are skipped. Nothing is percent-decoded, and the name and
 * value handed to the callback point into query and are not NUL-terminated.
 *
 * @param query         start of the query string (without the leading '?').
 *                      Only querylen bytes are examined.
 * @param querylen      number of bytes in query; values <= 0 yield 0.
 * @param url_param_cb  called as cb(ordinal, cbarg, namelen, name, valuelen,
 *                      value) with a 1-based ordinal; return 0 to stop the
 *                      walk, non-zero to continue. May be NULL, in which case
 *                      the pairs are only counted.
 * @param cbarg         opaque pointer passed through to the callback.
 * @return number of pairs reported, including the one on which the callback
 *         asked to stop.
 */
static int url_query_parse(const char *query, int querylen, int (*url_param_cb)(int, void *, int, const char *, int, const char *), void *cbarg)
{
    int params = 0;
    const char *item;
    const char *end;

    if (!query || querylen <= 0) {
        return 0;
    }

    item = query;
    end = query + querylen;

    while (item < end) {
        const char *amp = (const char *) memchr(item, '&', (size_t) (end - item));
        const char *item_end = amp ? amp : end;
        const char *eq = (const char *) memchr(item, '=', (size_t) (item_end - item));

        /* eq > item: name is non-empty; eq + 1 < item_end: value is non-empty. */
        if (eq && eq > item && eq + 1 < item_end) {
            params++;

            if (url_param_cb &&
                !url_param_cb(params, cbarg, (int) (eq - item), item, (int) (item_end - eq - 1), eq + 1)) {
                return params;
            }
        }

        if (!amp) {
            break;
        }
        item = amp + 1;
    }

    return params;
}


/**
 * Debug helper: dump a parsed URL to stdout, one component per line,
 * followed by the individual query parameters when a query is present.
 *
 * Components that were not found (NULL pointer, zero length) are printed
 * as empty braces; a NULL pointer is never handed to printf.
 *
 * @param parts  result of a url_parse() call; NULL is ignored.
 */
static void url_parts_print(const url_parts_t *parts)
{
    size_t i;

    if (!parts) {
        return;
    }

    {
        /* Same order as before: fragment is listed ahead of query. */
        const struct {
            const char *label;
            int len;
            const char *text;
        } rows[] = {
            { "protocol", parts->protocollen, parts->protocol },
            { "user",     parts->userlen,     parts->user },
            { "password", parts->passwordlen, parts->password },
            { "host",     parts->hostlen,     parts->host },
            { "port",     parts->portlen,     parts->port },
            { "path",     parts->pathlen,     parts->path },
            { "file",     parts->filelen,     parts->file },
            { "fragment", parts->fragmentlen, parts->fragment },
            { "query",    parts->querylen,    parts->query }
        };

        /* protocol points at the start of the original string, so this
         * prints the whole URL. */
        printf("url={%s}\n", parts->protocol ? parts->protocol : "");

        for (i = 0; i < sizeof(rows) / sizeof(rows[0]); i++) {
            printf(" .%-10s {%.*s}\n", rows[i].label, rows[i].len, rows[i].text ? rows[i].text : "");
        }
    }

    if (parts->query && parts->querylen) {
        url_query_parse(parts->query, parts->querylen, url_param_print, NULL);
    }
}

#ifdef __cplusplus
}
#endif

#endif /* URL_CODEC_INCLUDED */

以上是关于URL 解析极简版的主要内容,如果未能解决你的问题,请参考以下文章

URL 解析极简版

ATM管理系统(极简版)

极简版OKEX比特币跨期对冲策略

Golang 入门系列-八怎样实现定时任务,极简版.

极简版OpenGL 超级宝典(第五版)环境配置 VS2010

js消除小游戏(极简版)