c++ 高效解析url算法

Posted qianbo_insist

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了c++ 高效解析url算法相关的知识,希望对你有一定的参考价值。

协议解析url

用处

在 HTTP、RTP、RTMP、RTSP 等协议中都需要解析 URL。很多语言都有封装好的 URL 解析工具类库;在用 C++ 写的 server 中如果需要解析 URL,就需要自己写一个高效率的解析封装方法。这里使用 C++,仅依赖 C++ STL 的 string 类以及 C++ 的封装特性,解析部分尽量使用 C 的方式,以便于改进。

比较和查找

在 url 中查找子字符串,例如 ?、& 等。

// Compares the first `clen` characters at `pos` with the first `clen`
// characters of `compare`.
// Returns 0 when every character matches (also when `clen` is 0) and -1 at
// the first difference. `clen` is taken by reference but is only read.
static inline int judge_equeal(const char *pos, const char *compare, size_t &clen)
{
	const char *lhs = pos;
	const char *rhs = compare;
	size_t remaining = clen;
	while (remaining > 0)
	{
		if (*lhs != *rhs)
			return -1;
		++lhs;
		++rhs;
		--remaining;
	}
	return 0;
}
// Returns the zero-based offset of the first occurrence of `compare` in `u`,
// or -1 when `compare` does not occur in `u` or either argument is NULL.
// An empty `compare` matches at offset 0.
//
// strstr examines every start position, so a match that ends exactly at the
// end of `u` (for example "def" in "abcdef", or the trailing "/" in "a.com/")
// is reported as well.
static inline int string_find(const char *u, const char *compare)
{
	if (u == NULL || compare == NULL)
		return -1;
	const char *hit = strstr(u, compare);
	if (hit == NULL)
		return -1;
	return (int)(hit - u);
}
// Returns a pointer to the first occurrence of `compare` inside `u`, or NULL
// when `compare` does not occur in `u` or either argument is NULL.
// An empty `compare` matches at `u` itself.
//
// Example: searching "abcdef" for "def"
//     abcdef
//        ^---- returned pointer (u + 3)
//
// strstr examines every start position, so a match that ends exactly at the
// end of `u` is found too.
static inline const char* string_find_pos(const char *u, const char *compare)
{
	if (u == NULL || compare == NULL)
		return NULL;
	return strstr(u, compare);
}

数据结构定义

数据结构中依次为协议、主机字符串、主机端口号以及 uri。如果需要取 param 参数,例如对于
http://aaa.com:8080/?x=123
要求取 x 的值,则需要调用 GetParam 函数,具体请看后面 main 函数中的 sample。

// Holds the components of one parsed URL.
//   protocol : scheme text found before "://"
//   host     : host name or address
//   port     : port number, 80 until a parser stores another value
//   uri      : remainder of the URL as stored by the parser
typedef struct UrlParam
{
	string protocol;
	string host;
	unsigned short port = 80;
	string uri;

	// Puts every field back to its freshly-constructed state:
	// empty strings and port 80.
	void clear()
	{
		port = 80;
		uri.clear();
		host.clear();
		protocol.clear();
	}
}TUrlParam;

解析类封装

// Parses URLs of the form  protocol://host[:port][/path][?query]  and gives
// access to individual query-string parameters.
//
// Example:
//     TParseUrl parser("rtp://234.5.6.7:8000/live/1000/s1?a=abc&b=ddd");
//     parser.v_param.protocol;   // "rtp"
//     parser.v_param.host;       // "234.5.6.7"
//     parser.v_param.port;       // 8000
//     parser.v_param.uri;        // "live/1000/s1?a=abc&b=ddd"
//     parser.GetParam("a");      // "abc"
//
// Limitations: the first ':' inside the authority is always treated as the
// host/port separator, so "user:pass@host" and bracketed IPv6 literals are
// not split correctly.
class TParseUrl
{
private:
	// Converts the decimal digits in [first, last) into a port number.
	// Returns false and leaves `out` untouched when the range is empty,
	// contains anything other than '0'..'9', or the value exceeds 65535.
	static bool parse_port(const char *first, const char *last, unsigned short &out)
	{
		if (first == NULL || last == NULL || first >= last)
			return false;
		unsigned long value = 0;
		for (const char *p = first; p < last; ++p)
		{
			if (*p < '0' || *p > '9')
				return false;
			value = value * 10 + (unsigned long)(*p - '0');
			// Checked on every digit so `value` itself can never overflow.
			if (value > 65535UL)
				return false;
		}
		out = (unsigned short)value;
		return true;
	}
protected:
	// Splits the authority text [pos, posend) at its first ':'.
	// `posend` is exclusive: it points one past the last authority character.
	// When a ':' is present, param.host receives the text before it and
	// param.port receives the text after it if that text is a valid port
	// (otherwise param.port keeps its current value); returns 0.
	// Returns -1 without touching `param` when the range is empty or has no ':'.
	static int parse_domain(const char *pos, const char *posend, TUrlParam & param)
	{
		if (pos == NULL || posend == NULL || posend <= pos)
			return -1;
		// Search only inside the authority, so a ':' that appears later in
		// the path or query is never mistaken for the port separator.
		const char *colon = static_cast<const char *>(memchr(pos, ':', (size_t)(posend - pos)));
		if (colon == NULL)
			return -1;
		param.host = string(pos, (size_t)(colon - pos));
		unsigned short port = 0;
		if (parse_port(colon + 1, posend, port))
			param.port = port;
		return 0;
	}
	// Returns true when `num` is a non-empty run of decimal digits, optionally
	// preceded by a single '+' or '-'. NULL, "", "+" and "-" yield false.
	static bool IsNumber(const char * num)
	{
		if (num == NULL)
			return false;
		const char *p = num;
		if (*p == '+' || *p == '-')
			++p;
		if (*p == '\0')
			return false;
		for (; *p != '\0'; ++p)
		{
			// Plain range test: avoids passing a possibly negative char to isdigit.
			if (*p < '0' || *p > '9')
				return false;
		}
		return true;
	}
public:
	// Parses `url` immediately; the outcome is stored in v_param.
	TParseUrl(const char * url) {
		ParseUrl(url, v_param);
	}
	virtual ~TParseUrl() {}
	// Components of the most recently parsed URL.
	TUrlParam v_param;

	// Legacy early-exit macros. ParseUrl no longer expands them; they are left
	// defined in case other code that includes this file still does.
#define POS_JUDGE if(pos>=posend) return -1
#define POS_JUDGE_OK if(pos>=posend) return 0
	// Splits `url` into protocol, host, port and uri and stores them in `param`.
	//
	// Returns 0 on success. Returns -1 when `url` is NULL, contains no "://",
	// or has nothing after "://"; except for the NULL case, param.uri then
	// holds the complete input text.
	//
	// param.port is only overwritten when the URL carries a valid port, so it
	// keeps whatever value it had on entry otherwise.
	// param.uri receives the text after the '/' that ends the authority
	// (without that leading '/'). It is "/" when the URL has no path, and it
	// starts at the '?' or '#' when a query or fragment directly follows the host.
	static int ParseUrl(const char *url, TUrlParam &param)
	{
		if (url == NULL)
			return -1;
		param.uri = url;

		const char *scheme_end = strstr(url, "://");
		if (scheme_end == NULL)
			return -1;
		param.protocol = string(url, (size_t)(scheme_end - url));

		const char *authority = scheme_end + 3; //strlen("://")
		if (*authority == '\0')
			return -1;

		// The authority stops at the first '/', '?' or '#', or at the end of
		// the string when none of them is present.
		const char *authority_end = authority + strcspn(authority, "/?#");
		param.host = string(authority, (size_t)(authority_end - authority));
		// Overwrites host (and possibly port) when the authority has a ':'.
		parse_domain(authority, authority_end, param);

		if (*authority_end == '/' && authority_end[1] != '\0')
			param.uri = string(authority_end + 1);
		else if (*authority_end == '?' || *authority_end == '#')
			param.uri = string(authority_end);
		else
			param.uri = "/";
		return 0;
	}

	// Returns the value of query parameter `param` from the parsed uri, or an
	// empty string when the uri has no '?', the parameter is absent, or
	// `param` is NULL. The value runs up to the next '&' or the end of the uri.
	string GetParam(const char *param)
	{
		if (param == NULL)
			return "";
		const string &uri = v_param.uri;
		string::size_type query = uri.find('?');
		if (query == string::npos)
			return "";

		//?a=abc&b=ddd
		string key = param;
		key += "=";
		string::size_type at = uri.find(key, query + 1);
		while (at != string::npos)
		{
			// Accept the match only when it starts a parameter name, so that
			// "c=" inside "abc=" is skipped and the search continues.
			char before = uri[at - 1];
			if (before == '&' || before == '?')
			{
				string::size_type from = at + key.length();
				string::size_type amp = uri.find('&', from);
				if (amp == string::npos)
					return uri.substr(from);
				return uri.substr(from, amp - from);
			}
			at = uri.find(key, at + 1);
		}
		return "";
	}

	// Discards the previous result and parses `url` into v_param.
	void SetUrl(const char *url)
	{
		v_param.clear();
		ParseUrl(url, v_param);
	}
};

调用

在 GetParam 中,如果没有

if (c == '&' || c == '?')

这个判断是不行的。为了加快判断,我们直接做模式匹配;但是如果 url 中有 “abc=” 这样的参数,查找 “c=” 时会在 “abc=” 内部匹配成功,此时直接取值是不对的。因此,判断匹配位置的前一个字节是 ? 或者 & 是明智的做法。

int main()
{
cout << "======================================================"<<endl;
	const char * url = "rtp://234.5.6.7:8000/live/1000/s1?a=abc&b=ddd";
	cout << url << endl;
	TParseUrl parse(url);
	cout << parse.v_param.protocol << endl;
	cout << parse.v_param.host<< endl;
	cout << parse.v_param.port << endl;
	cout << parse.v_param.uri << endl;
	cout<<  parse.GetParam("a")<<endl;
	cout << parse.GetParam("b") << endl;
	cout << endl << endl;

	cout << "======================================================"<<endl;
	url = "https://127.0.0.1:9001/abc/qianbo/ss?abc=qianbo&abcd=test";
	cout << url << endl;
	parse.SetUrl(url);
	cout << parse.v_param.protocol << endl;
	cout << parse.v_param.host << endl;
	cout << parse.v_param.port << endl;
	cout << parse.v_param.uri << endl;
	cout << parse.GetParam("c") << endl;
	cout << parse.GetParam("d") << endl;
	cout << parse.GetParam("abc") << endl;
	cout << parse.GetParam("abcd") << endl;

}

url解析输出

测试

花了一小段时间写这个解析,希望有使用的人提出bug,留言。

全部代码,复制可用

/*
Author:钱波
email: 418511899@qq.com
wei:   18091589062
func  :类
time:  2021年6月9日
*/
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include <string>
#include <iostream>

using namespace std;

// Checks whether the `clen` characters starting at `pos` equal the first
// `clen` characters of `compare`.
// Returns 0 on a full match (also when `clen` is 0), -1 at the first
// mismatch. `clen` is passed by reference but never written.
static inline int judge_equeal(const char *pos, const char *compare, size_t &clen)
{
	size_t idx = 0;
	while (idx != clen)
	{
		if (pos[idx] != compare[idx])
			return -1;
		++idx;
	}
	return 0;
}
// Returns the zero-based offset of the first occurrence of `compare` in `u`,
// or -1 when `compare` does not occur in `u` or either argument is NULL.
// An empty `compare` matches at offset 0.
//
// strstr examines every start position, so a match that ends exactly at the
// end of `u` (for example "def" in "abcdef") is reported as well.
static inline int string_find(const char *u, const char *compare)
{
	if (u == NULL || compare == NULL)
		return -1;
	const char *hit = strstr(u, compare);
	return hit != NULL ? (int)(hit - u) : -1;
}
// Returns a pointer to the first occurrence of `compare` inside `u`, or NULL
// when `compare` does not occur in `u` or either argument is NULL.
// An empty `compare` matches at `u` itself.
//
// strstr examines every start position, so a match that ends exactly at the
// end of `u` (for example "def" in "abcdef") is found too.
static inline const char* string_find_pos(const char *u, const char *compare)
{
	if (u == NULL || compare == NULL)
		return NULL;
	return strstr(u, compare);
}
// Holds the components of one parsed URL.
//   protocol : scheme text found before "://"
//   host     : host name or address
//   port     : port number, 80 until a parser stores another value
//   uri      : remainder of the URL as stored by the parser
typedef struct UrlParam
{
	string protocol;
	string host;
	unsigned short port = 80;
	string uri;

	// Restores the freshly-constructed state: empty strings and port 80.
	void clear()
	{
		port = 80;
		uri.clear();
		host.clear();
		protocol.clear();
	}
}TUrlParam;
class TParseUrl
{
protected:
	static int parse_domain(const char *pos, const char *posend, TUrlParam & param)
	{
		int point = string_find(pos, ":");
		if(point>=0)
		{
			param.host = string(pos, point);
			pos += point + 1;
			string tmp = string(pos, posend - pos) ;
			if(IsNumber(tmp.c_str()))
				param.port = atoi(tmp.c_str());
			return 0;
		}
		return -1;
	}
	static bool IsNumber(const char * num)
	{
		int length = (int)strlen(num);
		for (int i = 0; i < length; i++)
		{
			if (i == 0 && (num[i] == '+' || num[i] == '-'))
			{
				if (length > 1)
					continue;
				return false;
			}
			if (!isdigit(num[i]))
				return false;
		}
		return true;
	}
public:
	TParseUrl(const char * url) {
		ParseUrl(url, v_param);
	}
	virtual ~TParseUrl() {};
	TUrlParam v_param;

#define POS_JUDGE if(pos>=posend) return -1
#define POS_JUDGE_OK if(pos>=posend) return 0
	static int ParseUrl(const char *url, TUrlParam &param)
	{
		//memset(&param, 0, sizeof(param));
		const char * posend = url + strlen(url) - 1;
		param.uri = url;
		const char * pos = url;
		int point = 0;
		if ((point = string_find(pos, "://")) >= 0)
		{
			param.protocol = string(url, point);
		}
		else
			return -1;
		pos += point + 3; //strlen("://")
		POS_JUDGE;
		if ((point = string_find(pos, "/")) >= 0)
		{
			param.host = string(pos, point);
			const char *end = pos + point;
			parse_domain(pos, end, param);
			param.uri

以上是关于c++ 高效解析url算法的主要内容,如果未能解决你的问题,请参考以下文章

华为OD机试 - 热点网络统计(JavaScript) | 机试题+算法思路+考点+代码解析 2023

片段(Java) | 机试题+算法思路+考点+代码解析 2023

PHP常用代码片段

C++ 中的高效消息工厂和处理程序

C++异常处理:掌握高效健壮代码的秘密武器

用于在 C++ 中包装整数的干净、高效的算法