URL 解析极简版
Posted 车斗
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了URL 解析极简版相关的知识,希望对你有一定的参考价值。
URL 解析极简版
网上找了很多 URL 解析的实现,要么太复杂,要么有 bug。于是我自己写了一份解析 URL 的代码:从头到尾不分配内存,并且只有一个头文件 urlcodec.h。
/**
* @file urlcodec.h
* @brief url encode and decode
* https://www.bejson.com/enc/urlencode/
* @author zhang
* @version
* @date 2021
* @note
* @since 2015
*/
#ifndef URL_CODEC_INCLUDED
#define URL_CODEC_INCLUDED
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
// #include <assert.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
 * Sample url_query_parse() callback: prints one "name=value" pair to stdout.
 *
 * @param paramid   1-based index of the parameter being reported.
 * @param cbarg     Opaque user argument; not used by this callback.
 * @param namelen   Number of bytes in name (name is not NUL-terminated).
 * @param name      Start of the parameter name.
 * @param valuelen  Number of bytes in value (value is not NUL-terminated).
 * @param value     Start of the parameter value.
 * @return Always 1, i.e. "continue with the next parameter" (0 would stop).
 */
static int url_param_print(int paramid, void *cbarg, int namelen, const char *name, int valuelen, const char *value)
{
    (void)cbarg; /* unused; silences -Wunused-parameter */

    /* "\n" (a real newline) - the former "\\n" printed a literal backslash-n. */
    printf(" :param%d {%.*s=%.*s}\n", paramid, namelen, name, valuelen, value);

    /* 0: break, 1: continue */
    return 1;
}
/**
 * Percent-encode a NUL-terminated string (form-style: ' ' becomes '+').
 *
 * ASCII letters, digits and '-', '_', '.', '~' are copied unchanged, a space
 * becomes '+', and every other byte becomes "%XX" (upper-case hex).
 *
 * @param url      Input string; NULL is treated as an empty string.
 * @param outbuf   Destination buffer; may be NULL when bufsize is 0 to only
 *                 query the required size.
 * @param bufsize  Capacity of outbuf in bytes.
 * @return Number of bytes required for the complete result INCLUDING the
 *         terminating NUL. The output is complete iff the return value is
 *         <= bufsize. On truncation the output is cut at the last whole
 *         encoded unit that fits and is still NUL-terminated (bufsize > 0).
 *
 * Example:
 *
 *   const char testUrl[] = "https://cn.bing.com/search?q=news&form=QBLH&sp=-1&pq=news&sc=8-4&qs=n&sk=&cvid=AFDF6C399DF44C41977CB0ED4AE87B33";
 *
 *   char encodedUrl[256];
 *   char decodedUrl[256];
 *
 *   size_t outsize = url_encode(testUrl, encodedUrl, sizeof(encodedUrl));
 *   if (outsize <= sizeof(encodedUrl)) {
 *       printf("success url_encode(): {%s}\n", encodedUrl);
 *
 *       outsize = url_decode(encodedUrl, decodedUrl, sizeof(decodedUrl));
 *       if (outsize <= sizeof(decodedUrl)) {
 *           printf("success url_decode(): {%s}\n", decodedUrl);
 *
 *           if (memcmp(testUrl, decodedUrl, outsize)) {
 *               printf("fatal error: urlcodec.h has bugs!\n");
 *           }
 *       } else {
 *           printf("failed url_decode: required outbuf at least: %d bytes\n", (int)outsize);
 *       }
 *   } else {
 *       printf("failed url_encode: required outbuf at least: %d bytes\n", (int)outsize);
 *   }
 */
static size_t url_encode(const char *url, char *outbuf, size_t bufsize)
{
    static const char hex_char_table[] = "0123456789ABCDEF";
    /* Work on unsigned bytes: <ctype.h> functions and shifts are only
     * well-defined for non-negative values. */
    const unsigned char *src = (const unsigned char *)(url ? url : "");
    size_t need = 0;                                   /* encoded length, no NUL */
    size_t used = 0;                                   /* bytes stored in outbuf */
    int truncated = (outbuf == NULL || bufsize == 0);  /* nothing can be stored  */

    for (; *src; src++) {
        const unsigned char c = *src;
        char unit[3];
        size_t unitlen;

        if (isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~') {
            unit[0] = (char)c;
            unitlen = 1;
        } else if (c == ' ') {
            unit[0] = '+';
            unitlen = 1;
        } else {
            unit[0] = '%';
            unit[1] = hex_char_table[c >> 4];
            unit[2] = hex_char_table[c & 15];
            unitlen = 3;
        }

        need += unitlen;

        /* Store the unit only if it fits AND leaves one byte for the NUL.
         * Once something did not fit, stop storing so the output is a clean
         * prefix instead of a string with holes in it. */
        if (!truncated && unitlen < bufsize - used) {
            memcpy(outbuf + used, unit, unitlen);
            used += unitlen;
        } else {
            truncated = 1;
        }
    }

    if (outbuf != NULL && bufsize > 0) {
        outbuf[used] = '\0'; /* used < bufsize always holds here */
    }

    return need + 1;
}
/**
 * Value of a single hexadecimal digit.
 *
 * @param ch  Character code (as unsigned char converted to int).
 * @return 0..15 for [0-9a-fA-F], or -1 if ch is not a hex digit.
 */
static int url_decode_hex_value(int ch)
{
    if (ch >= '0' && ch <= '9') {
        return ch - '0';
    }
    if (ch >= 'a' && ch <= 'f') {
        return ch - 'a' + 10;
    }
    if (ch >= 'A' && ch <= 'F') {
        return ch - 'A' + 10;
    }
    return -1;
}

/**
 * Decode a percent-encoded (form-style) string: "%XX" becomes the byte 0xXX
 * and '+' becomes a space; everything else is copied unchanged.
 *
 * A '%' that is not followed by two hexadecimal digits is copied literally
 * instead of being dropped or decoded into garbage.
 *
 * @param encurl   NUL-terminated encoded input; NULL is treated as "".
 * @param outbuf   Destination buffer; may be NULL when bufsize is 0 to only
 *                 query the required size.
 * @param bufsize  Capacity of outbuf in bytes.
 * @return Number of bytes required for the complete result INCLUDING the
 *         terminating NUL. The output is complete iff the return value is
 *         <= bufsize. On truncation the stored prefix is still
 *         NUL-terminated (bufsize > 0). Note that "%00" decodes to an
 *         embedded NUL byte, which is counted in the return value.
 */
static size_t url_decode(const char *encurl, char *outbuf, size_t bufsize)
{
    const char *src = encurl ? encurl : "";
    size_t need = 0; /* decoded length, no NUL */
    size_t used = 0; /* bytes stored in outbuf */

    while (*src) {
        char c = *src;
        size_t step = 1;

        if (c == '%') {
            /* src[2] is only inspected when src[1] is a hex digit, so the
             * terminating NUL is never read past. */
            const int hi = url_decode_hex_value((unsigned char)src[1]);
            const int lo = (hi < 0) ? -1 : url_decode_hex_value((unsigned char)src[2]);

            if (lo >= 0) {
                c = (char)((hi << 4) | lo);
                step = 3;
            }
        } else if (c == '+') {
            c = ' ';
        }

        need++;
        /* Keep one byte in reserve for the terminator. */
        if (outbuf != NULL && used + 1 < bufsize) {
            outbuf[used++] = c;
        }
        src += step;
    }

    if (outbuf != NULL && bufsize > 0) {
        outbuf[used] = '\0';
    }

    return need + 1;
}
/**
 * Components of a URL of the form
 *
 *   protocol://user:password@host:port/directory/file.extension?query#fragment
 *   https://john:hello@www.aspxfans.com:8080/world/news/index.asp?boardID=5&ID=24618&page=1#name
 *
 * url_parse() fills this structure without allocating: every pointer refers
 * to a position inside the caller's URL string, and the matching *len field
 * gives the component's length in bytes. The components are therefore NOT
 * NUL-terminated; always use pointer + length. The URL string must outlive
 * the structure.
 */
typedef struct {
char *protocol; /* [mandatory] https (also the start of the whole URL) */
int protocollen; /* 5 */
char *user; /* <optional> john */
int userlen; /* 4 */
char *password; /* <optional> hello */
int passwordlen;
char *host; /* [mandatory] www.aspxfans.com */
int hostlen;
char *port; /* <optional> 8080 */
int portlen;
char *path; /* <optional> /world/news/ */
int pathlen;
char *file; /* <optional> index.asp */
int filelen;
char *query; /* <optional> boardID=5&ID=24618&page=1 */
int querylen;
char *fragment; /* <optional> name */
int fragmentlen;
} url_parts_t;
/**
 * Split a URL into its components without allocating or modifying it.
 *
 * URL Specification: https://blog.csdn.net/fjb2080/article/details/80552213
 *
 * The authority ("user:password@host:port") ends at the first '/', '?' or
 * '#' after "://"; the path ends at the first '?' or '#'; the query ends at
 * the first '#'. Delimiters that appear later (an '@' or '/' inside the
 * query, for instance) are therefore never mistaken for structure.
 *
 * Field conventions:
 *  - protocol points at the start of url; protocollen excludes "://".
 *  - user always points at the start of the authority; userlen is 0 when
 *    there is no "user@" part. password is NULL when there is no ':' in the
 *    user-info part.
 *  - host/port are split at the first ':' of the host part (bracketed IPv6
 *    literals are not supported). port is NULL when there is no ':'.
 *  - path points at the first '/' after the authority (NULL if there is
 *    none). pathlen covers the directory part including its trailing '/',
 *    and is 0 when the file sits directly under the root.
 *  - file is what follows the last '/' of the path, up to '?' or '#'.
 *  - query excludes the leading '?', fragment excludes the leading '#'.
 *
 * @param url    NUL-terminated URL; must stay valid while parts is in use.
 * @param parts  Receives the result; it is zeroed first.
 * @return 0 on success; -1 if url or parts is NULL, if there is no
 *         non-empty protocol followed by "://", or if the host is empty.
 *         On failure the contents of parts are unspecified.
 */
static int url_parse(const char *url, url_parts_t *parts)
{
    char *sep;       /* the "://" separator                         */
    char *auth;      /* first byte of the authority                 */
    char *auth_end;  /* first byte after the authority              */
    char *at;        /* '@' ending the user-info, if any            */
    char *colon;     /* ':' inside user-info / host part, if any    */
    char *cursor;    /* parse position after the authority          */

    if (url == NULL || parts == NULL) {
        return -1;
    }

    memset(parts, 0, sizeof(url_parts_t));
    parts->protocol = (char *)url;

    /* protocol */
    sep = (char *)strstr(url, "://");
    if (sep == NULL || sep == url) {
        return -1;
    }
    parts->protocollen = (int)(sep - url);

    /* authority: [user[:password]@]host[:port] */
    auth = sep + 3;
    auth_end = auth + strcspn(auth, "/?#");
    parts->user = auth;
    parts->host = auth;

    at = (char *)memchr(auth, '@', (size_t)(auth_end - auth));
    if (at != NULL) {
        colon = (char *)memchr(auth, ':', (size_t)(at - auth));
        if (colon != NULL) {
            parts->userlen = (int)(colon - auth);
            parts->password = colon + 1;
            parts->passwordlen = (int)(at - parts->password);
        } else {
            parts->userlen = (int)(at - auth); /* excludes the '@' itself */
        }
        parts->host = at + 1;
    }

    colon = (char *)memchr(parts->host, ':', (size_t)(auth_end - parts->host));
    if (colon != NULL) {
        parts->hostlen = (int)(colon - parts->host);
        parts->port = colon + 1;
        parts->portlen = (int)(auth_end - parts->port);
    } else {
        parts->hostlen = (int)(auth_end - parts->host);
    }
    if (parts->hostlen == 0) {
        return -1;
    }

    /* path = directory + file */
    cursor = auth_end;
    if (*cursor == '/') {
        char *path_end = cursor + strcspn(cursor, "?#");
        char *last_slash = cursor;
        char *q;

        for (q = cursor + 1; q < path_end; q++) {
            if (*q == '/') {
                last_slash = q;
            }
        }

        parts->path = cursor;
        if (last_slash != cursor) {
            parts->pathlen = (int)(last_slash - cursor + 1);
        }
        parts->file = last_slash + 1;
        parts->filelen = (int)(path_end - parts->file);
        cursor = path_end;
    }

    /* query */
    if (*cursor == '?') {
        parts->query = cursor + 1;
        parts->querylen = (int)strcspn(parts->query, "#");
        cursor = parts->query + parts->querylen;
    }

    /* fragment */
    if (*cursor == '#') {
        parts->fragment = cursor + 1;
        parts->fragmentlen = (int)strlen(parts->fragment);
    }

    return 0;
}
/**
 * Walk the "name=value" pairs of a query string and report each one to a
 * callback. Nothing is allocated or copied: the callback receives pointers
 * into the query buffer plus lengths (the pieces are not NUL-terminated).
 *
 * Pairs are separated by '&'. Each pair is split at its FIRST '=', so a
 * value may itself contain '=' (e.g. "token=abc=="). Pairs with an empty
 * name, an empty value, or no '=' at all are skipped and not counted.
 *
 * @param query         Start of the query text (without the leading '?').
 * @param querylen      Number of bytes to examine; the text does not need to
 *                      be NUL-terminated.
 * @param url_param_cb  Called as cb(paramid, cbarg, namelen, name, valuelen,
 *                      value) with a 1-based paramid. Return 0 to stop,
 *                      non-zero to continue. May be NULL to only count.
 * @param cbarg         Passed through to the callback unchanged.
 * @return Number of pairs reported (or counted), including the one on which
 *         the callback asked to stop; 0 if query is NULL or querylen <= 0.
 *
 * Example:
 *
 *   url_parts_t parts;
 *   if (url_parse("https://john:hello@www.aspxfans.com:8080/world/news/index.asp?boardID=5&ID=24618&page=1#name", &parts) == 0) {
 *       // parse success
 *       if (parts.querylen) {
 *           // "boardID=5&ID=24618&page=1"
 *           url_query_parse(parts.query, parts.querylen, url_param_print, NULL);
 *       }
 *   }
 */
static int url_query_parse(const char *query, int querylen, int (*url_param_cb)(int, void *, int, const char *, int, const char *), void *cbarg)
{
    int params = 0;
    const char *end;
    const char *name;  /* start of the current pair                   */
    const char *eq;    /* first '=' of the current pair, NULL if none */
    const char *p;

    if (query == NULL || querylen <= 0) {
        return 0;
    }

    end = query + querylen;
    name = query;
    eq = NULL;

    /* The end of the buffer is handled like a final '&' so that the last
     * pair goes through exactly the same code as all the others. */
    for (p = query; ; p++) {
        if (p == end || *p == '&') {
            if (eq != NULL) {
                const int namelen = (int)(eq - name);
                const int valuelen = (int)(p - eq - 1);

                if (namelen && valuelen) {
                    params++;
                    if (url_param_cb && !url_param_cb(params, cbarg, namelen, name, valuelen, eq + 1)) {
                        return params;
                    }
                }
            }
            if (p == end) {
                break;
            }
            name = p + 1;
            eq = NULL;
        } else if (*p == '=' && eq == NULL) {
            eq = p; /* only the first '=' separates name from value */
        }
    }

    return params;
}
/**
 * Print one labelled URL component as " .label      {text}".
 *
 * @param label  Component name shown in the left column.
 * @param len    Number of bytes of text to print.
 * @param text   Component start; NULL (component absent) prints as "{}".
 */
static void url_parts_print_field(const char *label, int len, const char *text)
{
    /* Passing NULL to %s is undefined even with a zero precision, so an
     * absent component is printed as an empty string instead. */
    printf(" .%-10s {%.*s}\n", label, text ? len : 0, text ? text : "");
}

/**
 * Dump every component of a parsed URL to stdout, followed by the
 * individual query parameters (via url_query_parse / url_param_print) when
 * the query is non-empty.
 *
 * @param parts  Result of a url_parse() call; parts->protocol is printed as
 *               the full URL, so it must point at a NUL-terminated string
 *               (or be NULL).
 */
static void url_parts_print(const url_parts_t *parts)
{
    printf("url={%s}\n", parts->protocol ? parts->protocol : "");

    url_parts_print_field("protocol", parts->protocollen, parts->protocol);
    url_parts_print_field("user", parts->userlen, parts->user);
    url_parts_print_field("password", parts->passwordlen, parts->password);
    url_parts_print_field("host", parts->hostlen, parts->host);
    url_parts_print_field("port", parts->portlen, parts->port);
    url_parts_print_field("path", parts->pathlen, parts->path);
    url_parts_print_field("file", parts->filelen, parts->file);
    url_parts_print_field("fragment", parts->fragmentlen, parts->fragment);
    url_parts_print_field("query", parts->querylen, parts->query);

    if (parts->querylen) {
        url_query_parse(parts->query, parts->querylen, url_param_print, NULL);
    }
}
#ifdef __cplusplus
}
#endif
#endif /* URL_CODEC_INCLUDED */
以上是关于URL 解析极简版的主要内容,如果未能解决你的问题,请参考以下文章