| | 243 | |
|---|
| | 244 | |
|---|
# Upper bound on the number of entries kept in _uri_parse_cache.
# A falsy value (the default, 0) disables the bound entirely.
MAX_CACHE_SIZE = 0

# Maps the part of a Request-URI before the first "?" to its parsed
# (scheme, authority, path) tuple.
_uri_parse_cache = {}

# RFC 2396 section 3.1:
#     scheme = alpha *( alpha | digit | "+" | "-" | "." )
_SCHEME_FIRST_CHARS = frozenset(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
_SCHEME_CHARS = _SCHEME_FIRST_CHARS | frozenset("0123456789+-.")


def _is_uri_scheme(candidate):
    """Return True if candidate is syntactically a valid URI scheme."""
    if not candidate or candidate[0] not in _SCHEME_FIRST_CHARS:
        return False
    return all(c in _SCHEME_CHARS for c in candidate)


def requesturi_split(uri):
    """Parse uri into (scheme, authority, path, query).

    <scheme>://<authority>/<path>?<query>#<fragment>

    The scheme is lowercased; the other components are returned as
    found. Components that are absent come back as empty strings.
    The text before the first "?" is memoized in _uri_parse_cache.

    Raises ValueError if the path component contains a "#".
    """
    # Relevant BNF:
    # Request-URI    = "*" | absoluteURI | abs_path | authority
    # absoluteURI    = scheme ":" ( hier_part | opaque_part )
    # hier_part      = ( net_path | abs_path ) [ "?" query ]
    # net_path       = "//" authority [ abs_path ]
    # abs_path       = "/" path_segments
    # path_segments  = segment *( "/" segment )
    # segment        = *pchar *( ";" param )

    # What to do with "//path"? It cannot be an absoluteURI since it
    # does not start with a "scheme:" component; it cannot be 'authority'
    # since it includes the reserved '/' character; therefore, it must
    # be an abs_path. Note especially that this has NOTHING to do with
    # the "Relative URI References" section of RFC 2396:
    #     relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
    # That is, "//path" cannot be interpreted as a net_path:
    #     net_path = "//" authority [ abs_path ]
    # Unfortunately, urlparse.urlsplit gets this wrong even when we
    # explicitly tell it we're passing an HTTP URI. So we use our own.

    # Neither the scheme, the authority, nor the path components
    # may contain "?", a reserved character.
    # According to RFC 2396, absoluteURI's might treat everything after the
    # "scheme:" as an opaque_part (= *uric, including "?"), but we read
    # RFC 2616 3.2.2 as implying HTTP URI's are always hierarchical.
    # Therefore, rather than trying to cache all possible URI's, we cache
    # only the part before the first "?".
    query = ''
    if '?' in uri:
        uri, query = uri.split('?', 1)

    key = uri
    cached = _uri_parse_cache.get(key)
    if cached is not None:
        return cached + (query,)

    # uri may be an absoluteURI (including "http://host.domain.tld")
    scheme = authority = ''
    i = uri.find(':')
    # Only split on ":" when what precedes it is a well-formed scheme.
    # ":" is also a legal path character, so an abs_path such as
    # "/foo:bar" must be left intact rather than yielding scheme "/foo".
    if i > 0 and _is_uri_scheme(uri[:i]):
        scheme, uri = uri[:i].lower(), uri[i+1:]
        # Note the 'authority' component is only allowed when the URI
        # begins with an explicit "scheme:"
        if uri[:2] == '//':
            # The order is important! "/" is tried before "#".
            # ("?" cannot occur here: the query was split off above.)
            for c in ('/', '#'):
                i = uri.find(c, 2)
                if i >= 0:
                    authority, uri = uri[2:i], uri[i:]
                    break
            else:
                # Nothing follows the authority: the path is empty.
                authority, uri = uri[2:], ''

    if '#' in uri:
        raise ValueError("Illegal #fragment in Request-URI.")

    # avoid runaway growth
    if MAX_CACHE_SIZE and len(_uri_parse_cache) >= MAX_CACHE_SIZE:
        try:
            _uri_parse_cache.popitem()
        except KeyError:
            # The cache was already empty; nothing to evict.
            pass
    _uri_parse_cache[key] = v = (scheme, authority, uri)

    return v + (query,)
|---|