uriparser 解析处理URI

uriparser是一个用于解析与处理RFC3986标准URI的C语言库，这里简单记录其基本使用方式。

编译&安装

RPM包安装

centos7下可以通过yum安装。

只需要运行库的话可以安装uriparser

yum install uriparser

还需要头文件的话可以安装uriparser-devel

yum install uriparser-devel

源码编译

如果需要静态链接，可以考虑源码编译。

uriparser代码托管于github，可以直接下载release代码。

比如不需要文档和测试，只编译静态链接库，并指定安装目录前缀

# Build only the static library (no tests, no docs) and install it under ${INSTALL_DIR}.
# ${INSTALL_DIR} is expanded by both a POSIX shell and make; the $(INSTALL_DIR) form
# would be treated as a command substitution when pasted into a shell.
tar -xzvf uriparser-0.9.3.tar.gz && \
        cd uriparser-0.9.3 && \
        mkdir -p build && cd build && \
        cmake -DURIPARSER_BUILD_TESTS=off -DURIPARSER_BUILD_DOCS=off -DBUILD_SHARED_LIBS=off -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" .. && \
        make && make install

PS：编译需要cmake至少3.3版本，可以下载二进制安装包解压后将bin目录加入PATH环境变量直接使用，https://cmake.org/download/

例子

例子中可以看到，使用方式很简单，解析URI使用函数uriParseSingleUriA，如果解析成功，最后要调用uriFreeUriMembersA用以释放内存。
其他就是按照结构体构成读取解析后的各个URI结构。

main.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <uriparser/Uri.h>
#include <arpa/inet.h>

/**
 * Copy the character range [first, after_last) into buf as a NUL-terminated
 * string, truncating to at most buf_len - 1 characters.
 *
 * The source range does not need to be NUL-terminated: only the bytes inside
 * the range are read.
 *
 * @param buf        Destination buffer.
 * @param buf_len    Size of buf in bytes, including room for the terminator.
 * @param first      Pointer to the first character of the range (may be NULL).
 * @param after_last Pointer one past the last character of the range (may be NULL).
 * @return buf. It holds an empty string when the range is missing (NULL),
 *         empty, or reversed. When buf is NULL or buf_len is 0 nothing is
 *         written and buf is returned unchanged.
 */
char *print_text_range(char *buf, size_t buf_len, const char *first, const char *after_last) {
    size_t len;

    /* No room even for the terminator: nothing can be written safely. */
    if (buf == NULL || buf_len == 0) {
        return buf;
    }

    /* Missing, empty or reversed range: produce an empty string. */
    if (first == NULL || after_last == NULL || after_last <= first) {
        buf[0] = '\0';
        return buf;
    }

    len = (size_t)(after_last - first);
    if (len > buf_len - 1) {
        len = buf_len - 1; /* truncate, keeping one byte for the terminator */
    }

    memcpy(buf, first, len);
    buf[len] = '\0';
    return buf;
}

int main(int argc, char **argv) {
    UriUriA             uri;
    UriPathSegmentA     *path;

    const char          *error_pos;
    char                buf[256];
    char                ip_buf[16];

    if (argc < 2) {
        printf("Usage: %s URI\n", argv[0]);
        return -1;
    }

    printf("uri for parsing: \"%s\"\n", argv[1]);

    if (uriParseSingleUriA(&uri, argv[1], &error_pos) != URI_SUCCESS) {
        printf("parse uri failed, error pointer \"%s\"\n", error_pos);
        return -1;
    }

    printf("%-20s%s\n", "scheme:", uri.scheme.first ?
            print_text_range(buf, sizeof(buf), uri.scheme.first, uri.scheme.afterLast) : NULL);
    printf("%-20s%s\n", "userInfo:", uri.userInfo.first ?
            print_text_range(buf, sizeof(buf), uri.userInfo.first, uri.userInfo.afterLast) : NULL);
    printf("%-20s%s\n", "hostText:", uri.hostText.first ?
            print_text_range(buf, sizeof(buf), uri.hostText.first, uri.hostText.afterLast) : NULL);
    if (uri.hostData.ip4) {
        printf("%-20s%s\n", "hostData.ip4",
                inet_ntop(AF_INET, uri.hostData.ip4->data, ip_buf, sizeof(ip_buf)));
    }
    printf("%-20s%s\n", "portText:", uri.portText.first ?
            print_text_range(buf, sizeof(buf), uri.portText.first, uri.portText.afterLast) : NULL);
    printf("%-20s%s\n", "query:", uri.query.first ?
            print_text_range(buf, sizeof(buf), uri.query.first, uri.query.afterLast) : NULL);
    printf("%-20s%s\n", "fragment:", uri.fragment.first ?
            print_text_range(buf, sizeof(buf), uri.fragment.first, uri.fragment.afterLast) : NULL);

    for (path = uri.pathHead; path != NULL; path = path->next) {
        printf("%-20s%s\n", "pathHead:",
                print_text_range(buf, sizeof(buf), path->text.first, path->text.afterLast));
    }

    for (path = uri.pathTail; path != NULL; path = path->next) {
        printf("%-20s%s\n", "pathTail:",
                print_text_range(buf, sizeof(buf), path->text.first, path->text.afterLast));
    }

    uriFreeUriMembersA(&uri);

    return 0;
}

编译

gcc -g main.c -l uriparser

运行输出

[huyu@localhost test_uriparser]$ ./a.out "tcp://user:pass@host:443/a/b/cd?k=v&k2=v2#frag"
uri for parsing: "tcp://user:pass@host:443/a/b/cd?k=v&k2=v2#frag"
scheme:             tcp
userInfo:           user:pass
hostText:           host
portText:           443
query:              k=v&k2=v2
fragment:           frag
pathHead:           a
pathHead:           b
pathHead:           cd
pathTail:           cd
[huyu@localhost test_uriparser]$ ./a.out "tcp://user:pass@192.168.1.1:443/a/b/cd?k=v&k2=v2#frag"
uri for parsing: "tcp://user:pass@192.168.1.1:443/a/b/cd?k=v&k2=v2#frag"
scheme:             tcp
userInfo:           user:pass
hostText:           192.168.1.1
hostData.ip4        192.168.1.1
portText:           443
query:              k=v&k2=v2
fragment:           frag
pathHead:           a
pathHead:           b
pathHead:           cd
pathTail:           cd

结构体

uriparser的头文件中使用宏定义包装了结构体和函数的命名，宏如下。其功能是利用预处理器的记号拼接（##）生成最终的名称，比如URI_TYPE(Uri)展开后为UriUriA，URI_FUNC(FreeUriMembers)展开后为uriFreeUriMembersA。

结构体及函数命名宏

/* Drop any previous definition so the macro can be (re)defined below. */
#undef URI_FUNC
/* Build a function name by pasting "uri" + x + "A",
 * e.g. URI_FUNC(FreeUriMembers) expands to uriFreeUriMembersA. */
#define URI_FUNC(x) uri##x##A

/* Drop any previous definition so the macro can be (re)defined below. */
#undef URI_TYPE
/* Build a type name by pasting "Uri" + x + "A",
 * e.g. URI_TYPE(TextRange) expands to UriTextRangeA. */
#define URI_TYPE(x) Uri##x##A

了解过宏定义就可以看主要结构体了

UriUriA

/**
 * Represents an RFC 3986 %URI.
 * Missing components can be {NULL, NULL} ranges, so check <c>first</c>
 * for NULL before reading a component.
 *
 * @see uriFreeUriMembersA
 * @see uriFreeUriMembersMmA
 * @see UriParserStateA
 * @since 0.3.0
 */
typedef struct URI_TYPE(UriStruct) {
    URI_TYPE(TextRange) scheme; /**< Scheme (e.g. "http") */
    URI_TYPE(TextRange) userInfo; /**< User info (e.g. "user:pass") */
    URI_TYPE(TextRange) hostText; /**< Host text (set for all hosts, excluding square brackets) */
    URI_TYPE(HostData) hostData; /**< Structured host type specific data */
    URI_TYPE(TextRange) portText; /**< Port (e.g. "80") */
    URI_TYPE(PathSegment) * pathHead; /**< Head of a linked list of path segments */
    URI_TYPE(PathSegment) * pathTail; /**< Tail of the list behind pathHead */
    URI_TYPE(TextRange) query; /**< Query without leading "?" */
    URI_TYPE(TextRange) fragment; /**< Fragment without leading "#" */
    UriBool absolutePath; /**< Absolute path flag, distinguishing "a" and "/a";
                            always <c>URI_FALSE</c> for URIs with host */
    UriBool owner; /**< Memory owner flag */

    void * reserved; /**< Reserved to the parser */
} URI_TYPE(Uri); /**< @copydoc UriUriStructA */

UriUriA中大量出现的成员类型URI_TYPE(TextRange)定义如下，读取方式参考例子代码。

URI_TYPE(TextRange)

/**
 * Specifies a range of characters within a string.
 * The range is half-open: it includes all characters from <c>first</c>
 * up to, but not including, <c>afterLast</c>. So if both are
 * non-NULL, <c>afterLast - first</c> is the length of the text range.
 * The range is a view into a larger string, not a NUL-terminated string
 * of its own.
 *
 * @see UriUriA
 * @see UriPathSegmentA
 * @see UriHostDataA
 * @since 0.3.0
 */
typedef struct URI_TYPE(TextRangeStruct) {
    const URI_CHAR * first; /**< Pointer to first character */
    const URI_CHAR * afterLast; /**< Pointer to the character just past the last one in the range */
} URI_TYPE(TextRange); /**< @copydoc UriTextRangeStructA */

当主机部分是合法的IPv4地址时，uriparser还会把它解析为网络字节序的IPv4地址值，保存在UriUriA的hostData.ip4成员中，用法参考例子代码对hostData的处理。

URI_TYPE(PathSegment)代表了路径的每一段，链表结构。

URI_TYPE(PathSegment)

/**
 * Represents a path segment within a %URI path.
 * More precisely it is a node in a singly linked
 * list of path segments, traversed through <c>next</c>.
 *
 * @see UriUriA
 * @since 0.3.0
 */
typedef struct URI_TYPE(PathSegmentStruct) {
    URI_TYPE(TextRange) text; /**< Path segment name */
    struct URI_TYPE(PathSegmentStruct) * next; /**< Pointer to the next path segment in the list; NULL when this is the last segment */

    void * reserved; /**< Reserved to the parser */
} URI_TYPE(PathSegment); /**< @copydoc UriPathSegmentStructA */