URL 解析极简版

Posted 车斗

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了URL 解析极简版相关的知识,希望对你有一定的参考价值。

URL 解析极简版

网上找了很多，要么太复杂，要么有 bug。我写了一个解析 URL 的代码，从头到尾不分配内存，只有一个头文件：urlcodec.h。

/**
* @file urlcodec.h
* @brief url encode and decode
*    https://www.bejson.com/enc/urlencode/
* @author zhang
* @version
* @date 2021
* @note
* @since 2015
*/
#ifndef URL_CODEC_INCLUDED
#define URL_CODEC_INCLUDED

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
// #include <assert.h>

#ifdef __cplusplus
extern "C" 
#endif

/**
 * Sample url_query_parse() callback: prints one query parameter to stdout.
 *
 * @param paramid   Ordinal of the parameter as passed in by the caller.
 * @param cbarg     Opaque user argument; not used by this callback.
 * @param namelen   Length of name in bytes (name is not NUL-terminated).
 * @param name      Start of the parameter name.
 * @param valuelen  Length of value in bytes (value is not NUL-terminated).
 * @param value     Start of the parameter value.
 * @return Always 1, which tells the parser to continue (0 would stop it).
 */
static int url_param_print(int paramid, void *cbarg, int namelen, const char *name, int valuelen, const char *value)
{
    (void)cbarg;

    printf("  :param%d %.*s=%.*s\n", paramid, namelen, name, valuelen, value);

    /* 0: break,  1: continue */
    return 1;
}


/**
 * URL-encode a NUL-terminated string into a caller-supplied buffer.
 *
 * Alphanumerics (per isalnum) and '-', '_', '.', '~' are copied unchanged,
 * a space becomes '+', and every other byte becomes "%XX" with upper-case
 * hexadecimal digits. No memory is allocated.
 *
 * @param url      String to encode; NULL is treated as the empty string.
 * @param outbuf   Destination buffer; may be NULL when bufsize is 0 to query
 *                 the required size only.
 * @param bufsize  Capacity of outbuf in bytes.
 * @return Number of bytes the complete encoding needs, INCLUDING the
 *         terminating NUL. The output is complete iff the return value is
 *         <= bufsize. When the buffer is too small the output is cut at a
 *         token boundary (never in the middle of a "%XX" escape) and is
 *         still NUL-terminated as long as bufsize > 0.
 *
 * Example:
 *
 *  const char testUrl[] = "https://cn.bing.com/search?q=news&form=QBLH&sp=-1&pq=news&sc=8-4&qs=n&sk=&cvid=AFDF6C399DF44C41977CB0ED4AE87B33";
 *
 *  char encodedUrl[256];
 *  char decodedUrl[256];
 *
 *  size_t outsize = url_encode(testUrl, encodedUrl, sizeof(encodedUrl));
 *  if (outsize <= sizeof(encodedUrl)) {
 *      printf("success url_encode(): %s\n", encodedUrl);
 *
 *      outsize = url_decode(encodedUrl, decodedUrl, sizeof(decodedUrl));
 *      if (outsize <= sizeof(decodedUrl)) {
 *          printf("success url_decode(): %s\n", decodedUrl);
 *
 *          if (memcmp(testUrl, decodedUrl, outsize)) {
 *              printf("fatal error: urlcodec.h has bugs!\n");
 *          }
 *      } else {
 *          printf("failed url_decode: required outbuf at least: %d bytes\n", (int)outsize);
 *      }
 *  } else {
 *      printf("failed url_encode: required outbuf at least: %d bytes\n", (int)outsize);
 *  }
 */
static size_t url_encode(const char *url, char *outbuf, size_t bufsize)
{
    static const char hex_char_table[] = "0123456789ABCDEF";

    const unsigned char *p;
    size_t used = 0;        /* bytes stored in outbuf, terminator excluded */
    size_t blen = 0;        /* bytes the full encoding needs, terminator excluded */
    int truncated = 0;      /* set once a token did not fit; nothing is stored afterwards */

    if (!url) {
        url = "";
    }

    /* Work on unsigned bytes: passing a negative char to isalnum() is undefined. */
    for (p = (const unsigned char *)url; *p; p++) {
        char token[3];
        size_t tokenlen;

        if (isalnum(*p) || *p == '-' || *p == '_' || *p == '.' || *p == '~') {
            token[0] = (char)*p;
            tokenlen = 1;
        } else if (*p == ' ') {
            token[0] = '+';
            tokenlen = 1;
        } else {
            token[0] = '%';
            token[1] = hex_char_table[*p >> 4];
            token[2] = hex_char_table[*p & 15];
            tokenlen = 3;
        }

        blen += tokenlen;

        /* Strict '<' keeps one byte in reserve for the terminating NUL. */
        if (!truncated && outbuf && used + tokenlen < bufsize) {
            memcpy(outbuf + used, token, tokenlen);
            used += tokenlen;
        } else {
            truncated = 1;
        }
    }

    if (outbuf && bufsize > 0) {
        outbuf[used] = '\0';
    }

    return blen + 1;
}


/**
 * Value of one hexadecimal digit, or -1 when ch is not a hex digit.
 * Written without <ctype.h> so the result does not depend on the locale.
 */
static int url_decode_hex_value(unsigned char ch)
{
    if (ch >= '0' && ch <= '9') {
        return ch - '0';
    }
    if (ch >= 'a' && ch <= 'f') {
        return ch - 'a' + 10;
    }
    if (ch >= 'A' && ch <= 'F') {
        return ch - 'A' + 10;
    }
    return -1;
}

/**
 * Decode a URL-encoded, NUL-terminated string into a caller-supplied buffer.
 *
 * "%XX" (two hexadecimal digits, either case) becomes the byte 0xXX, '+'
 * becomes a space, and every other byte is copied unchanged. A '%' that is
 * not followed by two hexadecimal digits is malformed and is copied
 * literally instead of being dropped or decoded into garbage.
 * No memory is allocated.
 *
 * @param encurl   String to decode; NULL is treated as the empty string.
 * @param outbuf   Destination buffer; may be NULL when bufsize is 0 to query
 *                 the required size only.
 * @param bufsize  Capacity of outbuf in bytes.
 * @return Number of bytes the complete decoding needs, INCLUDING the
 *         terminating NUL. The output is complete iff the return value is
 *         <= bufsize; otherwise it is a NUL-terminated prefix (when
 *         bufsize > 0).
 * @note "%00" decodes to an embedded NUL byte, so the decoded data may be
 *       shorter when treated as a C string than the returned size suggests.
 */
static size_t url_decode(const char *encurl, char *outbuf, size_t bufsize)
{
    const unsigned char *p;
    size_t used = 0;    /* bytes stored in outbuf, terminator excluded */
    size_t blen = 0;    /* bytes the full decoding needs, terminator excluded */

    if (!encurl) {
        encurl = "";
    }

    for (p = (const unsigned char *)encurl; *p; p++) {
        unsigned char ch = *p;

        if (ch == '%') {
            /* p[2] is only inspected when p[1] is a hex digit, hence not the NUL. */
            int hi = url_decode_hex_value(p[1]);
            int lo = (hi >= 0) ? url_decode_hex_value(p[2]) : -1;

            if (lo >= 0) {
                ch = (unsigned char)((hi << 4) | lo);
                p += 2;
            }
        } else if (ch == '+') {
            ch = ' ';
        }

        blen++;

        /* Strict '<' keeps one byte in reserve for the terminating NUL. */
        if (outbuf && used + 1 < bufsize) {
            outbuf[used++] = (char)ch;
        }
    }

    if (outbuf && bufsize > 0) {
        outbuf[used] = '\0';
    }

    return blen + 1;
}


/**
 * Components of a URL as located by url_parse().
 *
 * protocol://user:password@host:port/directory/file.extension
 * https://john:hello@www.aspxfans.com:8080/world/news/index.asp?boardID=5&ID=24618&page=1#name
 *
 * Every pointer refers into the string that was handed to url_parse();
 * nothing is copied. The components are therefore NOT NUL-terminated:
 * always use a pointer together with its length (e.g. printf "%.*s"),
 * and keep the original URL string alive while the parts are in use.
 */
typedef struct {
    char *protocol;    /* [mandatory] https */
    int protocollen;   /* 5 */

    char *user;        /* <optional> john */
    int userlen;       /* 4 */

    char *password;    /* <optional> hello */
    int passwordlen;

    char *host;        /* [mandatory] www.aspxfans.com */
    int hostlen;

    char *port;        /* <optional> 8080 */
    int portlen;

    char *path;        /* <optional> /world/news/  */
    int pathlen;

    char *file;        /* <optional> index.asp */
    int filelen;

    char *query;       /* <optional> boardID=5&ID=24618&page=1 */
    int querylen;

    char *fragment;    /* <optional> name */
    int fragmentlen;
} url_parts_t;


/**
 * Split a URL into its components without copying or allocating.
 *
 * URL Specification: https://blog.csdn.net/fjb2080/article/details/80552213
 *
 * Layout handled:
 *   protocol://[user[:password]@]host[:port][/path/][file][?query][#fragment]
 *
 * Rules applied:
 *  - protocol is everything before the first "://" and must not be empty.
 *  - The authority (user, password, host, port) ends at the first '/', '?'
 *    or '#', so an '@', ':' or '/' that appears later (for example inside
 *    the query) never influences user, host, port or path.
 *  - user is separated from the host by the first '@' in the authority;
 *    password is what follows the first ':' inside the user info.
 *  - port is what follows the first ':' after the user info.
 *  - path runs from the first '/' up to and including the last '/' before
 *    the query/fragment; file is the remainder of that segment.
 *  - query follows '?' and stops at '#'; fragment is everything after '#'.
 *
 * @param url    NUL-terminated URL. The string must outlive *parts because
 *               every pointer stored in *parts points into it.
 * @param parts  Receives the components; zeroed first. Absent optional
 *               components have length 0. parts->user is always set to the
 *               start of the authority, even when userlen is 0; the other
 *               absent optional pointers stay NULL.
 * @return 0 on success; -1 when url or parts is NULL, when no non-empty
 *         protocol followed by "://" is found, or when the host is empty.
 */
static int url_parse(const char *url, url_parts_t *parts)
{
    char *sep;             /* start of "://" */
    char *authority;       /* first byte after "://" */
    char *authority_end;   /* first '/', '?', '#' or the terminating NUL */
    char *at;
    char *colon;
    char *p;

    if (!parts) {
        return -1;
    }
    memset(parts, 0, sizeof(*parts));

    if (!url) {
        return -1;
    }

    /* The struct stores non-const pointers; the URL itself is never written to. */
    parts->protocol = (char *)url;

    sep = (char *)strstr(url, "://");
    if (!sep || sep == parts->protocol) {
        return -1;
    }
    parts->protocollen = (int)(sep - parts->protocol);

    authority = sep + 3;
    authority_end = authority + strcspn(authority, "/?#");

    /* user[:password]@ -- only looked for inside the authority */
    parts->user = authority;
    parts->host = authority;

    at = (char *)memchr(authority, '@', (size_t)(authority_end - authority));
    if (at) {
        colon = (char *)memchr(authority, ':', (size_t)(at - authority));
        if (colon) {
            parts->userlen = (int)(colon - authority);
            parts->password = colon + 1;
            parts->passwordlen = (int)(at - parts->password);
        } else {
            parts->userlen = (int)(at - authority);
        }
        parts->host = at + 1;
    }

    /* host[:port] */
    colon = (char *)memchr(parts->host, ':', (size_t)(authority_end - parts->host));
    if (colon) {
        parts->hostlen = (int)(colon - parts->host);
        parts->port = colon + 1;
        parts->portlen = (int)(authority_end - parts->port);
    } else {
        parts->hostlen = (int)(authority_end - parts->host);
    }

    if (!parts->hostlen) {
        return -1;
    }

    p = authority_end;

    /* /path/ and file: confined to the segment before '?' or '#' */
    if (*p == '/') {
        char *segment_end = p + strcspn(p, "?#");
        char *last_slash = p;
        char *q;

        for (q = p + 1; q < segment_end; q++) {
            if (*q == '/') {
                last_slash = q;
            }
        }

        parts->path = p;
        parts->pathlen = (int)(last_slash - p + 1);   /* includes the trailing '/' */
        parts->file = last_slash + 1;
        parts->filelen = (int)(segment_end - parts->file);

        p = segment_end;
    }

    if (*p == '?') {
        parts->query = p + 1;
        parts->querylen = (int)strcspn(parts->query, "#");
        p = parts->query + parts->querylen;
    }

    if (*p == '#') {
        parts->fragment = p + 1;
        parts->fragmentlen = (int)strlen(parts->fragment);
    }

    return 0;
}


/**
 * Walk the "name=value" pairs of a query string and report each one to a
 * callback. Nothing is copied or allocated; name and value point into query
 * and are NOT NUL-terminated.
 *
 * Pairs are separated by '&'. Inside a pair the FIRST '=' separates the name
 * from the value, so "a=b=c" yields name "a" and value "b=c". Pairs with an
 * empty name or an empty value (such as "flag", "=x" or "sk=") are skipped
 * and not counted.
 *
 * @param query         Start of the query string (without the leading '?').
 * @param querylen      Number of bytes of query to examine.
 * @param url_param_cb  Called as cb(ordinal, cbarg, namelen, name, valuelen,
 *                      value) with a 1-based ordinal; return 0 to stop the
 *                      walk, non-zero to continue. May be NULL, in which
 *                      case the pairs are only counted.
 * @param cbarg         Opaque pointer passed through to the callback.
 * @return Number of pairs reported (including the one on which the callback
 *         asked to stop); 0 when query is NULL or querylen <= 0.
 *
 * Example:
 *
 *  url_parts_t parts;
 *  if (url_parse("https://john:hello@www.aspxfans.com:8080/world/news/index.asp?boardID=5&ID=24618&page=1#name", &parts) == 0) {
 *      // parse success
 *      if (parts.querylen) {
 *          // "boardID=5&ID=24618&page=1"
 *          url_query_parse(parts.query, parts.querylen, url_param_print, NULL);
 *      }
 *  }
 */
static int url_query_parse(const char *query, int querylen, int (*url_param_cb)(int, void *, int, const char *, int, const char *), void *cbarg)
{
    int params = 0;
    const char *p;
    const char *end;

    if (!query || querylen <= 0) {
        return 0;
    }

    p = query;
    end = query + querylen;

    while (p < end) {
        const char *name = p;
        const char *amp = (const char *)memchr(p, '&', (size_t)(end - p));
        const char *pair_end = amp ? amp : end;
        const char *eq = (const char *)memchr(name, '=', (size_t)(pair_end - name));

        if (eq) {
            const char *value = eq + 1;
            int namelen = (int)(eq - name);
            int valuelen = (int)(pair_end - value);

            if (namelen > 0 && valuelen > 0) {
                params++;

                if (url_param_cb && !url_param_cb(params, cbarg, namelen, name, valuelen, value)) {
                    return params;
                }
            }
        }

        if (!amp) {
            break;      /* last pair consumed */
        }
        p = amp + 1;
    }

    return params;
}


static void url_parts_print(const url_parts_t *parts)

    printf("url=%s\\n", parts->protocol);
    printf(" .%-10s %.*s\\n", "protocol", parts->protocollen, parts->protocol);
    printf(" .%-10s %.*s\\n", "user", parts->userlen, parts->user);
    printf(" .%-10s %.*s\\n", "password", parts->passwordlen, parts->password);
    printf(" .%-10s %.*s\\n", "host", parts->hostlen, parts->host);
    printf(" .%-10s %.*s\\n", "port", parts->portlen, parts->port);
    printf(" .%-10s %.*s\\n", "path", parts->pathlen, parts->path);
    printf(" .%-10s %.*s\\n", "file", parts->filelen, parts->file);
    printf(" .%-10s %.*s\\n", "fragment", parts->fragmentlen, parts->fragment);
    printf(" .%-10s %.*s\\n", "query", parts->querylen, parts->query);

    if (parts->querylen) 
        url_query_parse(parts->query, parts->querylen, url_param_print, NULL);
    


#ifdef __cplusplus

#endif

#endif /* URL_CODEC_INCLUDED */

以上是关于URL 解析极简版的主要内容,如果未能解决你的问题,请参考以下文章

URL 解析极简版

js消除小游戏(极简版)`

SimpleThreadPool极简版

cookie——登录注册极简版

Win极简版Plasm部署智能合约——搭建Plasm节点——2021.5.31

mysql 支持的存储引擎(极简版)