uriparser是一个用于解析与处理RFC3986标准URI的C语言库,这里简单记录其基本使用方式。

编译&安装

RPM包安装

centos7下可以通过yum安装。

只需要运行库的话可以安装uriparser
1
yum install uriparser
还需要头文件的话可以安装uriparser-devel
1
yum install uriparser-devel

源码编译

如果需要静态链接,可以考虑源码编译。

uriparser代码托管于github,可以直接下载release代码

比如不需要文档和测试,只编译静态链接库,并指定安装目录前缀
1
2
3
4
5
# Build uriparser 0.9.3 as a static library only (no tests, no docs) and
# install it under the prefix given by the INSTALL_DIR shell variable.
# Fail early with a clear message if INSTALL_DIR is unset or empty.
: "${INSTALL_DIR:?set INSTALL_DIR to the desired install prefix}" && \
tar -xzvf uriparser-0.9.3.tar.gz && \
cd uriparser-0.9.3 && \
mkdir -p build && cd build && \
cmake -DURIPARSER_BUILD_TESTS=off -DURIPARSER_BUILD_DOCS=off -DBUILD_SHARED_LIBS=off -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" .. && \
make && make install

PS:编译需要cmake至少3.3版本,可以下载二进制安装包解压后将bin目录加入PATH环境变量直接使用,https://cmake.org/download/

例子

例子中可以看到,使用方式很简单,解析URI使用函数uriParseSingleUriA,如果解析成功,最后要调用uriFreeUriMembersA用以释放内存。
其他就是按照结构体构成读取解析后的各个URI结构。

main.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#include <stdio.h>
#include <stdlib.h>
#include <uriparser/Uri.h>
#include <arpa/inet.h>

/**
 * Copy the character range [first, after_last) into buf as a NUL-terminated
 * string, truncating if the range does not fit.
 *
 * The source range does not have to be NUL-terminated: at most
 * (after_last - first) characters are read. Copying also stops at an
 * embedded NUL inside the range.
 *
 * @param buf        Destination buffer; may be NULL only if buf_len is 0.
 * @param buf_len    Size of buf in bytes, including room for the terminator.
 * @param first      Pointer to the first character of the range.
 * @param after_last Pointer one past the last character of the range.
 * @return buf. It holds an empty string when the range is NULL, empty or
 *         reversed; it is left untouched when buf is NULL or buf_len is 0.
 */
char *print_text_range(char *buf, size_t buf_len, const char *first, const char *after_last) {
    size_t len = 0;
    size_t i;

    if (buf == NULL || buf_len == 0) {
        return buf;
    }

    /* Only a well-formed, non-empty range contributes characters; checking
     * the order first keeps the pointer difference non-negative. */
    if (first != NULL && after_last != NULL && after_last > first) {
        len = (size_t)(after_last - first);
        if (len > buf_len - 1) {
            len = buf_len - 1; /* leave room for the terminator */
        }
    }

    for (i = 0; i < len && first[i] != '\0'; i++) {
        buf[i] = first[i];
    }
    buf[i] = '\0';

    return buf;
}

int main(int argc, char **argv) {
UriUriA uri;
UriPathSegmentA *path;

const char *error_pos;
char buf[256];
char ip_buf[16];

if (argc < 2) {
printf("Usage: %s URI\n", argv[0]);
return -1;
}

printf("uri for parsing: \"%s\"\n", argv[1]);

if (uriParseSingleUriA(&uri, argv[1], &error_pos) != URI_SUCCESS) {
printf("parse uri failed, error pointer \"%s\"\n", error_pos);
return -1;
}

printf("%-20s%s\n", "scheme:", uri.scheme.first ?
print_text_range(buf, sizeof(buf), uri.scheme.first, uri.scheme.afterLast) : NULL);
printf("%-20s%s\n", "userInfo:", uri.userInfo.first ?
print_text_range(buf, sizeof(buf), uri.userInfo.first, uri.userInfo.afterLast) : NULL);
printf("%-20s%s\n", "hostText:", uri.hostText.first ?
print_text_range(buf, sizeof(buf), uri.hostText.first, uri.hostText.afterLast) : NULL);
if (uri.hostData.ip4) {
printf("%-20s%s\n", "hostData.ip4",
inet_ntop(AF_INET, uri.hostData.ip4->data, ip_buf, sizeof(ip_buf)));
}
printf("%-20s%s\n", "portText:", uri.portText.first ?
print_text_range(buf, sizeof(buf), uri.portText.first, uri.portText.afterLast) : NULL);
printf("%-20s%s\n", "query:", uri.query.first ?
print_text_range(buf, sizeof(buf), uri.query.first, uri.query.afterLast) : NULL);
printf("%-20s%s\n", "fragment:", uri.fragment.first ?
print_text_range(buf, sizeof(buf), uri.fragment.first, uri.fragment.afterLast) : NULL);

for (path = uri.pathHead; path != NULL; path = path->next) {
printf("%-20s%s\n", "pathHead:",
print_text_range(buf, sizeof(buf), path->text.first, path->text.afterLast));
}

for (path = uri.pathTail; path != NULL; path = path->next) {
printf("%-20s%s\n", "pathTail:",
print_text_range(buf, sizeof(buf), path->text.first, path->text.afterLast));
}

uriFreeUriMembersA(&uri);

return 0;
}
编译
1
gcc -g main.c -l uriparser
运行输出
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
[huyu@localhost test_uriparser]$ ./a.out "tcp://user:pass@host:443/a/b/cd?k=v&k2=v2#frag"
uri for parsing: "tcp://user:pass@host:443/a/b/cd?k=v&k2=v2#frag"
scheme: tcp
userInfo: user:pass
hostText: host
portText: 443
query: k=v&k2=v2
fragment: frag
pathHead: a
pathHead: b
pathHead: cd
pathTail: cd
[huyu@localhost test_uriparser]$ ./a.out "tcp://user:pass@192.168.1.1:443/a/b/cd?k=v&k2=v2#frag"
uri for parsing: "tcp://user:pass@192.168.1.1:443/a/b/cd?k=v&k2=v2#frag"
scheme: tcp
userInfo: user:pass
hostText: 192.168.1.1
hostData.ip4 192.168.1.1
portText: 443
query: k=v&k2=v2
fragment: frag
pathHead: a
pathHead: b
pathHead: cd
pathTail: cd

结构体

uriparser的头文件中使用宏定义包装了结构体和函数的命名,宏如下。其功能是通过预处理器的##运算符进行记号(token)拼接,比如URI_TYPE(Uri)在展开后为UriUriA,URI_FUNC(FreeUriMembers)在展开后为uriFreeUriMembersA。

结构体及函数命名宏
1
2
3
4
5
/* URI_FUNC(x) token-pastes x into a function name of the form uri<x>A,
 * e.g. URI_FUNC(FreeUriMembers) expands to uriFreeUriMembersA.
 * The #undef first drops any earlier definition of the macro. */
#undef URI_FUNC
#define URI_FUNC(x) uri##x##A

/* URI_TYPE(x) token-pastes x into a type name of the form Uri<x>A,
 * e.g. URI_TYPE(Uri) expands to UriUriA. */
#undef URI_TYPE
#define URI_TYPE(x) Uri##x##A

了解过宏定义就可以看主要结构体了

UriUriA
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/**
 * Represents an RFC 3986 %URI.
 * Missing components can be {NULL, NULL} ranges.
 *
 * The path is stored as a singly linked list of segments reachable from
 * pathHead; pathTail points at the last node of that list.
 *
 * @see uriFreeUriMembersA
 * @see uriFreeUriMembersMmA
 * @see UriParserStateA
 * @since 0.3.0
 */
typedef struct URI_TYPE(UriStruct) {
URI_TYPE(TextRange) scheme; /**< Scheme (e.g. "http") */
URI_TYPE(TextRange) userInfo; /**< User info (e.g. "user:pass") */
URI_TYPE(TextRange) hostText; /**< Host text (set for all hosts, excluding square brackets) */
URI_TYPE(HostData) hostData; /**< Structured host type specific data */
URI_TYPE(TextRange) portText; /**< Port (e.g. "80") */
URI_TYPE(PathSegment) * pathHead; /**< Head of a linked list of path segments */
URI_TYPE(PathSegment) * pathTail; /**< Tail of the list behind pathHead */
URI_TYPE(TextRange) query; /**< Query without leading "?" */
URI_TYPE(TextRange) fragment; /**< Fragment without leading "#" */
UriBool absolutePath; /**< Absolute path flag, distinguishing "a" and "/a";
always <c>URI_FALSE</c> for URIs with host */
UriBool owner; /**< Memory owner flag */

void * reserved; /**< Reserved to the parser */
} URI_TYPE(Uri); /**< @copydoc UriUriStructA */

UriUriA中大量出现的成员类型URI_TYPE(TextRange)定义如下,读取方式参考例子代码。

URI_TYPE(TextRange)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Specifies a range of characters within a string.
 * The range includes all characters from <c>first</c>
 * to one before <c>afterLast</c>. So if both are
 * non-NULL the difference is the length of the text range.
 *
 * Because the range is a window into a longer string, the character at
 * <c>afterLast</c> is generally not a NUL terminator; copy the range with
 * an explicit length before treating it as a C string.
 *
 * @see UriUriA
 * @see UriPathSegmentA
 * @see UriHostDataA
 * @since 0.3.0
 */
typedef struct URI_TYPE(TextRangeStruct) {
const URI_CHAR * first; /**< Pointer to first character */
const URI_CHAR * afterLast; /**< Pointer to character after the last one still in */
} URI_TYPE(TextRange); /**< @copydoc UriTextRangeStructA */

UriUriA的hostData成员:当主机名是合法的IPv4地址时,uriparser会将其解析为网络字节序的IPv4地址值,参考例子代码对hostData的处理。

URI_TYPE(PathSegment)代表了路径的每一段,链表结构。

URI_TYPE(PathSegment)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/**
 * Represents a path segment within a %URI path.
 * More precisely it is a node in a singly linked
 * list of path segments; follow <c>next</c> until it
 * is NULL to visit every segment in order.
 *
 * @see UriUriA
 * @since 0.3.0
 */
typedef struct URI_TYPE(PathSegmentStruct) {
URI_TYPE(TextRange) text; /**< Path segment name */
struct URI_TYPE(PathSegmentStruct) * next; /**< Pointer to the next path segment in the list, NULL for the last segment */

void * reserved; /**< Reserved to the parser */
} URI_TYPE(PathSegment); /**< @copydoc UriPathSegmentStructA */