10 Appendix
10.1 Python code
10.1.2 Link crawler
102
# Frame for selection boxes (reconstructed from PDF-garbled source: fused
# statements split back out, broken tokens like 'o nvalue' repaired).
frame_selection_boxes = tk.Frame(master=frame_selection)
frame_selection_boxes.pack(side='top', anchor='nw')

# Select? — checkbox marking whether the item is included in the collection.
frame_selected = tk.Frame(master=frame_selection_boxes)
frame_selected.pack(side='left', padx=2)
lbl_selected = tk.Label(master=frame_selected, text='12. Include in collection',
                        font=("Arial", 10), padx=4)
lbl_selected.pack(side='left')
chk_selected_var = tk.BooleanVar()
chk_selected = tk.Checkbutton(master=frame_selected, variable=chk_selected_var,
                              offvalue=False, onvalue=True)
chk_selected.pack(side='left', padx=2)

# Noteworthy? — checkbox flagging items of special interest.
frame_noteworthy = tk.Frame(master=frame_selection_boxes)
frame_noteworthy.pack(side='left', padx=2)
lbl_noteworthy = tk.Label(master=frame_noteworthy, text='13. Special interest',
                          font=("Arial", 10), padx=2)
lbl_noteworthy.pack(side='left')
chk_noteworthy_var = tk.BooleanVar()
chk_noteworthy = tk.Checkbutton(master=frame_noteworthy, variable=chk_noteworthy_var,
                                offvalue=False, onvalue=True)
chk_noteworthy.pack(side='left', padx=2)

# Reason for selection — free-text entry area.
lbl_selection_reason = tk.Label(master=frame_selection,
                                text='14. Reason for selection / rejection',
                                font=("Arial", 10), padx=5, pady=2)
lbl_selection_reason.pack(side='top', anchor='nw')
text_selection_reason = tk.Text(master=frame_selection, width=41, height=3)
text_selection_reason.pack(side='top', anchor='nw', padx=15, pady=2)

window.mainloop()
103
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) from collections import Counter
def check_subdomain_discrepancies(working_link, seed_base, cache_extract):
    """Reconcile 'www.' subdomain differences between a crawled link and the seed base.

    When one side carries a 'www.' subdomain and the other does not, probe the
    rewritten URL with an HTTP GET and keep whichever form responds.

    Args:
        working_link: absolute URL found during the crawl.
        seed_base: hostname (+path) of the crawl seed.
        cache_extract: tldextract-style callable; its result exposes `.subdomain`.

    Returns:
        (link, seed_base) — either or both possibly rewritten.
    """
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/35.0.1916.153 Safari/537.36')
    working_link_sub = cache_extract(working_link).subdomain
    seed_base_sub = cache_extract(seed_base).subdomain
    if seed_base_sub:
        seed_base_no_sub = seed_base.replace(seed_base_sub + '.', '')
    else:
        seed_base_no_sub = seed_base
    if seed_base_no_sub not in working_link:
        # Link is not on the seed's site: nothing to reconcile.
        return working_link, seed_base
    working_link_split = working_link.split('://')
    if seed_base_sub and (not working_link_sub):
        if seed_base_sub == 'www':
            # Seed uses 'www.' but the link does not: try the link with it added.
            sub_include_link = (working_link_split[0] + '://' + seed_base_sub
                                + '.' + working_link_split[1])
            try:
                # Fix: requests.get raises requests.exceptions, never URLError,
                # so the original except clause could not fire.
                requests.get(sub_include_link,
                             headers={'User-Agent': user_agent}, verify=False)
                return sub_include_link, seed_base
            except requests.exceptions.RequestException:
                # Rewritten URL unreachable: keep the link, drop the subdomain
                # from the base instead.
                return working_link, seed_base_no_sub
        else:
            return working_link, seed_base
    elif (not seed_base_sub) and working_link_sub:
        if working_link_sub == 'www':
            # Link uses 'www.' but the seed does not: try the link without it.
            sub_exclude_link = (working_link_split[0] + '://'
                                + working_link_split[1].replace(working_link_sub + '.', ''))
            try:
                requests.get(sub_exclude_link, headers={'User-Agent': 'Mozilla/5.0'})
                return sub_exclude_link, seed_base
            except requests.exceptions.RequestException:
                return working_link, working_link_sub + '.' + seed_base
        else:
            # Non-'www' subdomain on the link only: mark the base with a
            # leading space (original behavior, preserved).
            return working_link, ' ' + seed_base
    else:
        # Subdomains absent on both sides, or present on both (equal or not):
        # the original returned the pair unchanged in every one of these cases.
        return working_link, seed_base
def check_http_discrepancies(working_link, seed, seed_base):
    """Align the link's scheme (http vs https) with the seed's scheme.

    Only applies to links on the seed's own site; the rewritten URL is probed
    with a GET and kept only if it responds.

    Returns the (possibly rewritten) link. Fixes two defects in the original:
    the undefined name `URLerror` in the except clause, and a missing return
    (the original fell through to `None` when the schemes already matched).
    """
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/35.0.1916.153 Safari/537.36')
    if seed_base not in working_link:
        return working_link
    working_link_split = working_link.split('://')
    seed_split = seed.split('://')
    if working_link_split[0] == seed_split[0]:
        # Schemes already agree — nothing to do.
        return working_link
    working_link_w_seed_http = seed_split[0] + '://' + working_link_split[1]
    try:
        requests.get(working_link_w_seed_http,
                     headers={'User-Agent': user_agent}, verify=False)
        return working_link_w_seed_http
    except requests.exceptions.RequestException:
        return working_link
def correct_capitalization_error(working_link):
    """Lower-case the scheme+host portion of a URL, leaving the path untouched."""
    parsed = urllib.parse.urlparse(working_link)
    # Strip the scheme and the path to isolate the host part of the URL.
    host_part = working_link.replace(parsed.scheme + '://', '').replace(parsed.path, '')
    return working_link.replace(host_part, host_part.lower())
def correct_slash_error(working_link, seed_base):
    """Append a trailing slash to site-root URLs; pass every other URL through."""
    parsed = urllib.parse.urlparse(working_link)
    without_scheme = working_link.replace(parsed.scheme + '://', '')
    if parsed.path == '':
        # Bare host with no path at all.
        return working_link + '/'
    if not without_scheme.endswith('/') and without_scheme == seed_base.strip('/'):
        # URL is exactly the seed base, just missing the trailing slash.
        return working_link + '/'
    return working_link
def dict_check_for_presence(value, _dict, key, current_link):
    """Record current_link under _dict[key][value] without duplicating it.

    Creates the list on first use. Mutates _dict in place and returns it
    (callers reassign the return value).
    """
    # setdefault covers both original branches: new-list + append, and
    # existing-list + append-if-absent.
    bucket = _dict[key].setdefault(value, [])
    if current_link not in bucket:
        bucket.append(current_link)
    return _dict
def edit_seed(seed):
    """Derive the crawl base for a seed URL.

    Returns (seed, seed_base, seed_path):
      * seed gains a trailing '/' when its path is empty;
      * seed_base is the hostname, plus the path when the path looks like a
        directory (no '.' in it) rather than a file;
      * seed_path is the original URL path.

    Fix: the original parsed the same URL four times and left `parse_seed`
    unused; parse once and reuse.
    """
    parsed = urllib.parse.urlparse(seed)
    seed_path = parsed.path
    if seed_path != '/' and '.' not in seed_path:
        # Directory-style path: keep it as part of the crawl base.
        seed_base = parsed.hostname + seed_path
    else:
        seed_base = parsed.hostname
    if seed_path == '':
        seed = seed + "/"
    return seed, seed_base, seed_path
def fix_http_error(link):
    """Repair a malformed URL scheme.

    Handles two failure shapes: a misspelt scheme before '://' (e.g. 'htp',
    'htps') is rebuilt as http/https, and an 'http'/'https' prefix missing its
    '://' separator has the separator characters inserted.
    """
    scheme = urllib.parse.urlparse(link).scheme
    if scheme:
        # Scheme present but possibly misspelt: strip it and rebuild.
        remainder = link.replace(scheme, '').strip(':/')
        prefix = 'https://' if 's' in scheme else 'http://'
        return prefix + remainder
    head = link[:link.rfind('/')]
    if 'https' in head:
        chars = list(link)
        for pos, ch in ((5, ':'), (6, '/'), (7, '/')):
            chars.insert(pos, ch)
        return ''.join(chars)
    if 'http' in head:
        chars = list(link)
        for pos, ch in ((4, ':'), (5, '/'), (6, '/')):
            chars.insert(pos, ch)
        return ''.join(chars)
    return link
def scrape_links_and_SM(seed, step):
    """Breadth-first link crawl from *seed*, up to *step* levels deep.

    Collects links on the seed's own site ('inner', which feed the next crawl
    level) and links pointing elsewhere ('outer'), normalising each URL via the
    helper functions and discarding links matching the filter lists.

    Returns:
        (links_per_level, outer_links_base, log_name), or (None, None, None)
        when the seed itself is unreachable.

    Fixes vs the original: the seed-probe caught URLError, which requests.get
    never raises; filters_files was missing a comma between '.7z' and '.ico'
    (implicit concatenation produced the useless filter '.7z.ico' and lost
    both extensions); the bare `except:` also swallowed KeyboardInterrupt;
    unused locals (frame_page, pretty_dict_str) removed.
    """
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/35.0.1916.153 Safari/537.36')
    print('Seed: ', seed)
    try:
        requests.get(seed, headers={'User-Agent': user_agent}, verify=False)
    except requests.exceptions.RequestException:
        print('Seed URL error')
        return None, None, None
    # cache_dir=None: no on-disk TLD cache (avoids permission issues).
    custom_cache_extract = tldextract.TLDExtract(cache_dir=None)
    # Links never followed at all (mail/JS/social-sharing links).
    filters_all = ['mailto:', 'MAILTO:', 'javascript:', '@', 'share=',
                   'facebook.com/sharer', 'linkedin.com/shareArticle',
                   'whatsapp.com/send?', 'wa.me/?', 'twitter.com/intent/tweet',
                   'twitter.com/share?url=', 'pinterest.com/pin',
                   'reddit.com/submit', 'telegram.me/share']
    # On-site links excluded from the crawl frontier.
    filters_inner = ['/comment', '/post-edit', '/email-post', '/search/',
                     '/search?', 'file://', 'replytocom', 'add-to-cart=',
                     'ddownload=', 'tmpl=component', 'print=1',
                     '/winkelwagen/', '/shoppingcart/', '/shopping-cart/']
    # File extensions excluded from the crawl (fix: comma restored between
    # '.7z' and '.ico').
    filters_files = ['.jpg', '.jpeg', '.gif', '.png', '.tiff', '.tif', '.eps',
                     '.exe', '.xlsx', '.xls', '.txt', '.docx', '.msi', '.raw',
                     '.bmp', '.mid', '.mp3', '.mp4', '.avi', '.pdf', '.zip',
                     '.doc', '.7z', '.ico', '.csv', '.odt', '.ods', '.ppt',
                     '.pptx', '.epub', '.flac', '.ogg', '.wav', '.mpeg',
                     '.mov', '.wmv', '.flv', '.webm', '.mkv']
    filters_files_upper = [ext.upper() for ext in filters_files]
    filters_inner = filters_inner + filters_files + filters_files_upper
    seed, seed_base, seed_path = edit_seed(seed)
    print('Seed base: ', seed_base, '\n')
    seed_no_punct = urllib.parse.urlparse(seed).netloc.replace('.', '_')
    inner_links = [seed]
    outer_links = []
    outer_links_base = {}
    links_per_level = {0: {'inner': [seed]}}
    for i in range(step):
        if i not in links_per_level or 'inner' not in links_per_level[i]:
            continue
        for current_url in links_per_level[i]['inner']:
            if (i + 1) not in links_per_level:
                links_per_level[i + 1] = {}
            try:
                session = requests.Session()
                response = session.get(current_url,
                                       headers={'User-Agent': user_agent},
                                       verify=False)
                seed_soup = BeautifulSoup(response.content, 'html.parser',
                                          from_encoding="iso-8859-1")
                # Collect (tag, attribute) pairs for frames and anchors.
                current_page_link_tags = []
                for frame in seed_soup.find_all(['frame', 'iframe', 'FRAME', 'IFRAME']):
                    if frame.has_attr('src'):
                        current_page_link_tags.append((frame, 'src'))
                for a in seed_soup.find_all(['a', 'A', 'area', 'AREA']):
                    if a.has_attr('href'):
                        current_page_link_tags.append((a, 'href'))
                for tag, attr in current_page_link_tags:
                    if (tag.has_attr('onclick') and 'http' in tag['onclick']
                            and tag[attr] not in tag['onclick']):
                        # JavaScript-opened link: extract the URL embedded in
                        # the onclick handler, up to the closing quote+comma.
                        raw_link = tag['onclick']
                        head = raw_link[:raw_link.find('\',')]
                        working_link = head[head.find('http'):].strip(' ”\'')
                    else:
                        working_link = tag[attr].strip(' ”\'')
                    if not working_link:
                        continue
                    if working_link in ['', '/', seed, '#']:
                        continue
                    if any(x in working_link for x in filters_all):
                        continue
                    working_link = fix_http_error(working_link)
                    if 'http' not in working_link:
                        # Relative link: resolve against the current page.
                        working_link = urllib.parse.urljoin(current_url, working_link)
                    parsed = urllib.parse.urlparse(working_link)
                    if not (parsed.hostname and '.' in str(parsed.hostname)
                            and custom_cache_extract(working_link).suffix):
                        # Not a resolvable public-suffix URL: skip.
                        continue
                    # Trim markup residue and fragments, then normalise.
                    if '><' in working_link:
                        working_link = working_link.split('><')[0]
                    if 'target=' in working_link:
                        working_link = working_link[:working_link.find('target=')]
                    if '#' in working_link:
                        working_link = working_link[:working_link.find('#')]
                    working_link = correct_capitalization_error(working_link)
                    working_link, temp_seed_base = check_subdomain_discrepancies(
                        working_link, seed_base, custom_cache_extract)
                    working_link = correct_slash_error(working_link, temp_seed_base)
                    # Compare against the base using the link minus its last
                    # path segment (and minus any query remainder).
                    working_short = working_link[:working_link.rfind('/') + 1]
                    if '?' in working_short:
                        working_short = working_short[:working_short.rfind('?') + 1]
                    if temp_seed_base in working_short:
                        # On-site link.
                        if not any(x in working_link for x in filters_inner):
                            working_link = check_http_discrepancies(
                                working_link, seed, temp_seed_base)
                            if working_link.strip('/') not in [l.strip('/') for l in inner_links]:
                                links_per_level = dict_check_for_presence(
                                    'inner', links_per_level, i + 1, working_link)
                                inner_links.append(working_link)
                    else:
                        # Off-site link: count per hostname, skip numeric hosts.
                        linkparse = urllib.parse.urlparse(working_link)
                        if linkparse.hostname and not linkparse.hostname.replace(
                                '.html', '').replace('.', '').replace(':', '').isnumeric():
                            link_base = linkparse.hostname
                            outer_links_base[link_base] = outer_links_base.get(link_base, 0) + 1
                            if working_link.strip('/') not in [l.strip('/') for l in outer_links]:
                                links_per_level = dict_check_for_presence(
                                    'outer', links_per_level, i + 1, working_link)
                                outer_links.append(working_link)
            except Exception:
                # Best-effort crawl: one failing page must not abort the run.
                # (Narrowed from a bare 'except:' that also caught SystemExit
                # and KeyboardInterrupt.)
                pass
    if seed_path != '/' and seed_path != '':
        log_name = ('Link_logs/' + seed_no_punct + '_'
                    + seed_path.strip('/').replace('/', '_') + '_linklog.log')
    else:
        log_name = 'Link_logs/' + seed_no_punct + '_linklog.log'
    # Drop empty levels before returning.
    links_per_level = {k: v for k, v in links_per_level.items() if v}
    return links_per_level, outer_links_base, log_name
# Driver: crawl every seed in `logs` four levels deep and write one log file
# per seed with link counts, outer-link tallies, and the raw level dict.
for seed in logs:
    link_dict, base_outer_links, log_name = scrape_links_and_SM(seed, 4)
    if link_dict and base_outer_links and log_name:
        # Tally distinct outer hostnames and total outer-link occurrences.
        outerlinks_count = len(base_outer_links)
        outerlinks_total_count = sum(base_outer_links.values())
        # Inner counter starts at -1 so the seed itself is not counted.
        link_counts = [-1, 0]
        for level in link_dict.values():
            if 'inner' in level:
                link_counts[0] += len(level['inner'])
            if 'outer' in level:
                link_counts[1] += len(level['outer'])
        links_count_dict = {'Unique inner links:': link_counts[0],
                            'Unique outer links': link_counts[1],
                            'Unique base outer links:': outerlinks_count,
                            'Total outer links': outerlinks_total_count}
        # Re-join pretty-printed output so wrapped string fragments sit on one
        # line. Fix: the original used print_split.index(s) while iterating,
        # which rewrites the wrong element when the list contains duplicates.
        link_dict_print_op = pp.pformat(link_dict)
        print_split = [s.replace(' \'\n \'', ' ') for s in link_dict_print_op.split(',')]
        reconstr_print = ','.join(print_split)
        with open(log_name, "w") as log_file:
            pprint.pprint(links_count_dict, log_file, sort_dicts=False, width=1)
            log_file.write('\n')
            pprint.pprint(base_outer_links, log_file, width=1)
            log_file.write('\n')
            log_file.write(reconstr_print)
            log_file.write('\n\n')
            log_file.write('Raw links counts: ' + str(links_count_dict))
            log_file.write('\n\n')
            log_file.write('Raw base outer links: ' + str(base_outer_links))
            log_file.write('\n\n')
            log_file.write('Raw link levels: ' + str(link_dict))
    else:
        print(seed, ' gives error.')