diff options
author | Alec Murphy <alec@checksum.fail> | 2017-02-23 14:25:42 -0500 |
---|---|---|
committer | Alec Murphy <alec@checksum.fail> | 2017-02-23 14:25:42 -0500 |
commit | 00a26bf2641d927af577ca01ba6895236d466119 (patch) | |
tree | bf4e282568e2fa9fbb69e24ef0e1957157d8a8d4 | |
parent | 5fe4ab849c9ab86beae46c0a20707025b8316231 (diff) |
Fix relative path updating on 301/302.
-rw-r--r-- | uriel.py | 40 |
1 file changed, 39 insertions, 1 deletion
@@ -70,7 +70,9 @@ def UrielGetPage(): if url_comp.query != '': post_scheme += '?'+url_comp.query url = scheme + "://" + post_scheme - pagedata = subprocess.Popen('wget -q -O - -U "' + Uriel.user_agent + '" "' + url + '" 2>/dev/null', shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate()[0] + pagereq = subprocess.Popen('wget -O - -U "' + Uriel.user_agent + '" "' + url + '"', shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE).communicate() + pagedata = pagereq[0] + pagehdrs = pagereq[1].split('\n') filedata = UrielPreProcess(pagedata, url) filesize = len(filedata) if filesize>0: @@ -97,6 +99,42 @@ def UrielGetPage(): os.write(HGBD,Uriel.download_buffer) logger.info("[Uriel] copy to download buffer " + url) else: + for p_hdr in pagehdrs: + if p_hdr.lower().find('location: ') != -1: + if p_hdr.lower().find('[following]') != -1: + url_comp = urlparse.urlparse(p_hdr[p_hdr.lower().find('location: ')+10:p_hdr.lower().find('[following]')].strip()) + scheme = '' + netloc = '' + path = '' + if url_comp.scheme == '': + scheme = Uriel.rel.scheme + else: + scheme = url_comp.scheme + Uriel.rel.scheme = url_comp.scheme + if url_comp.netloc == '': + netloc = Uriel.rel.netloc + else: + netloc = url_comp.netloc + Uriel.rel.netloc = url_comp.netloc + if url_comp.path != '': + if url_comp.path.find('/') != -1: + if url_comp.scheme == '' or url_comp.netloc == '': + if url_comp.path[:1] != '/': + path = Uriel.rel.path + url_comp.path + Uriel.rel.path += url_comp.path[:url_comp.path.rfind('/')+1] + else: + path = url_comp.path + Uriel.rel.path = url_comp.path[:url_comp.path.rfind('/')+1] + else: + path = url_comp.path + Uriel.rel.path = url_comp.path[:url_comp.path.rfind('/')+1] + else: + path = Uriel.rel.path + url_comp.path + post_scheme = netloc + "/" + urllib.quote(path) + post_scheme = post_scheme.replace('//','/') + if url_comp.query != '': + post_scheme += '?'+url_comp.query + url = scheme + "://" + post_scheme Uriel.nav_index += 1 
Uriel.history = Uriel.history[0:Uriel.nav_index] Uriel.history.append({'url':url, 'filedata':filedata}) |