
10 Appendix

10.1 Python code


#Frame for selection boxes
frame_selection_boxes = tk.Frame(master=frame_selection)
frame_selection_boxes.pack(side='top', anchor='nw')

#Select?
frame_selected = tk.Frame(master=frame_selection_boxes)
frame_selected.pack(side='left', padx=2)
lbl_selected = tk.Label(master=frame_selected, text='12. Include in collection', font=("Arial", 10), padx=4)
lbl_selected.pack(side='left')
chk_selected_var = tk.BooleanVar()
chk_selected = tk.Checkbutton(master=frame_selected, variable=chk_selected_var, offvalue=False, onvalue=True)
chk_selected.pack(side='left', padx=2)

#Noteworthy?
frame_noteworthy = tk.Frame(master=frame_selection_boxes)
frame_noteworthy.pack(side='left', padx=2)
lbl_noteworthy = tk.Label(master=frame_noteworthy, text='13. Special interest', font=("Arial", 10), padx=2)
lbl_noteworthy.pack(side='left')
chk_noteworthy_var = tk.BooleanVar()
chk_noteworthy = tk.Checkbutton(master=frame_noteworthy, variable=chk_noteworthy_var, offvalue=False, onvalue=True)
chk_noteworthy.pack(side='left', padx=2)

#Reason for selection
lbl_selection_reason = tk.Label(master=frame_selection, text='14. Reason for selection / rejection', font=("Arial", 10), padx=5, pady=2)
lbl_selection_reason.pack(side='top', anchor='nw')
text_selection_reason = tk.Text(master=frame_selection, width=41, height=3)
text_selection_reason.pack(side='top', anchor='nw', padx=15, pady=2)

window.mainloop()

10.1.2 Link crawler
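The listing resumes partway through the crawler script, so the imports and module-level objects it relies on are not reproduced here. A minimal sketch of the assumed preamble (module names inferred from the calls below; pp and logs are hypothetical stand-ins for objects defined earlier in the appendix):

#Assumed preamble, not part of the original listing
import pprint
import urllib.parse
from urllib.error import URLError

import requests
import urllib3
import tldextract
from bs4 import BeautifulSoup

pp = pprint.PrettyPrinter()              #used below via pp.pformat(...)
logs = ['https://www.example.com/']      #hypothetical list of seed URLs crawled at the end of the script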

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from collections import Counter

#Align the 'www' subdomain of a found link with that of the seed, keeping the change only if the adjusted URL resolves
def check_subdomain_discrepancies(working_link, seed_base, cache_extract):
    working_link_sub = cache_extract(working_link).subdomain
    seed_base_sub = cache_extract(seed_base).subdomain
    if seed_base_sub:
        seed_base_no_sub = seed_base.replace(seed_base_sub+'.', '')
    else:
        seed_base_no_sub = seed_base
    if seed_base_no_sub in working_link:
        working_link_split = working_link.split('://')
        if seed_base_sub and (not working_link_sub):
            if seed_base_sub == 'www':
                sub_include_link = working_link_split[0]+'://'+seed_base_sub+'.'+working_link_split[1]
                try:
                    requests.get(sub_include_link, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}, verify=False)
                    return sub_include_link, seed_base
                except URLError:
                    temp_seed_base = seed_base_no_sub
                    return working_link, temp_seed_base
            else:
                return working_link, seed_base
        elif (not seed_base_sub) and working_link_sub:
            if working_link_sub == 'www':
                sub_exclude_link = working_link_split[0]+'://'+working_link_split[1].replace(working_link_sub+'.', '')
                try:
                    requests.get(sub_exclude_link, headers={'User-Agent': 'Mozilla/5.0'})
                    return sub_exclude_link, seed_base
                except URLError:
                    temp_seed_base = working_link_sub+'.'+seed_base
                    return working_link, temp_seed_base
            else:
                temp_seed_base = ' '+seed_base
                return working_link, temp_seed_base
        elif seed_base_sub and working_link_sub and seed_base_sub != working_link_sub:
            return working_link, seed_base
        elif seed_base_sub and working_link_sub and seed_base_sub == working_link_sub:
            return working_link, seed_base
        else:
            return working_link, seed_base
    else:
        return working_link, seed_base

#Give an inner link the same scheme (http/https) as the seed, keeping the change only if the adjusted URL resolves
def check_http_discrepancies(working_link, seed, seed_base):
    if seed_base in working_link:
        working_link_split = working_link.split('://')
        seed_split = seed.split('://')
        if working_link_split[0] != seed_split[0]:
            working_link_w_seed_http = seed_split[0]+'://'+working_link_split[1]
            try:
                requests.get(working_link_w_seed_http, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}, verify=False)
                return working_link_w_seed_http
            except URLError:
                return working_link
        else:
            return working_link

#Lower-case the host part of a link (between the scheme and the path), leaving the path untouched
def correct_capitalization_error(working_link):
    url_parse = urllib.parse.urlparse(working_link)
    isolated_link = working_link.replace(url_parse.scheme+'://', '').replace(url_parse.path, '')
    corr_link = working_link.replace(isolated_link, isolated_link.lower())
    return corr_link
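A brief usage sketch (hypothetical URL; the expected output follows from the function above):

#Hypothetical example: only the host is lower-cased, the path keeps its capitalisation
print(correct_capitalization_error('https://Example.COM/Some/Path'))
#-> 'https://example.com/Some/Path'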

#Append a trailing slash to links that point to the bare seed domain
def correct_slash_error(working_link, seed_base):
    link_parse = urllib.parse.urlparse(working_link)
    working_link_no_scheme = working_link.replace(link_parse.scheme+'://', '')
    link_path = urllib.parse.urlparse(working_link).path
    if link_parse.path == '':
        return working_link + '/'
    elif working_link_no_scheme.endswith('/') == False and working_link_no_scheme == seed_base.strip('/'):
        return working_link + '/'
    else:
        return working_link
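For example (hypothetical input; output as implied by the first branch above):

#Hypothetical example: a bare domain link gets a trailing slash
print(correct_slash_error('https://www.example.com', 'www.example.com'))
#-> 'https://www.example.com/'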

#Record current_link under _dict[key][value], creating the list if needed and skipping duplicates
def dict_check_for_presence(value, _dict, key, current_link):
    if not value in _dict[key].keys():
        _dict[key][value] = []
        _dict[key][value].append(current_link)
    else:
        if current_link not in _dict[key][value]:
            _dict[key][value].append(current_link)
    return _dict
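A sketch of how the crawler uses this helper (hypothetical link and level; demo_levels is an illustrative variable):

#Hypothetical example: a link found on crawl level 1 is stored under demo_levels[1]['inner']
demo_levels = {1: {}}
demo_levels = dict_check_for_presence('inner', demo_levels, 1, 'https://www.example.com/about/')
#-> {1: {'inner': ['https://www.example.com/about/']}}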

#Derive the seed base (hostname, plus the path for subsite seeds) and normalise the seed itself
def edit_seed(seed):
    parse_seed = urllib.parse.urlparse(seed)
    seed_path = urllib.parse.urlparse(seed).path
    if seed_path != '/':
        if not '.' in seed_path:
            seed_base = urllib.parse.urlparse(seed).hostname+seed_path
        else:
            seed_base = urllib.parse.urlparse(seed).hostname
    else:
        seed_base = urllib.parse.urlparse(seed).hostname
    if seed_path == '':
        seed = seed+"/"
    return seed, seed_base, seed_path
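Two hypothetical examples of how a seed is normalised into (seed, seed_base, seed_path), with outputs as implied by the function above:

print(edit_seed('https://www.example.com'))
#-> ('https://www.example.com/', 'www.example.com', '')
print(edit_seed('https://www.example.com/blog'))
#-> ('https://www.example.com/blog', 'www.example.com/blog', '/blog')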

#Repair malformed or missing 'http(s)://' prefixes
def fix_http_error(link):
    url_parse = urllib.parse.urlparse(link)
    url_scheme = url_parse.scheme
    if url_scheme:
        if 's' in url_scheme:
            return 'https://'+ link.replace(url_scheme, '').strip(':/')
        else:
            return 'http://'+ link.replace(url_scheme, '').strip(':/')
    else:
        if 'https' in link[:link.rfind('/')]:
            split_link = list(link)
            split_link.insert(5, ':')
            split_link.insert(6, '/')
            split_link.insert(7, '/')
            return ''.join(split_link)
        elif 'http' in link[:link.rfind('/')]:
            split_link = list(link)
            split_link.insert(4, ':')
            split_link.insert(5, '/')
            split_link.insert(6, '/')
            return ''.join(split_link)
        else:
            return link
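Two hypothetical repairs (outputs as implied by the branches above):

print(fix_http_error('https:/example.com/page'))     #malformed separator
#-> 'https://example.com/page'
print(fix_http_error('httpwww.example.com/page'))    #missing '://'
#-> 'http://www.example.com/page'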

#Breadth-first crawl of the seed up to 'step' levels deep, splitting found links into inner (same seed base) and outer links
def scrape_links_and_SM(seed, step):
    print('Seed: ', seed)
    try:
        requests.get(seed, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}, verify=False)
    except URLError:
        print('Seed URL error')
        return None, None, None
    custom_cache_extract = tldextract.TLDExtract(cache_dir=None)
    #Links containing any of these strings are discarded altogether
    filters_all = ['mailto:','MAILTO:','javascript:','@','share=','facebook.com/sharer','linkedin.com/shareArticle',
                   'whatsapp.com/send?','wa.me/?','twitter.com/intent/tweet','twitter.com/share?url=','pinterest.com/pin',
                   'reddit.com/submit','telegram.me/share']
    #Inner links containing any of these strings or file extensions are not followed
    filters_inner = ['/comment','/post-edit','/email-post','/search/','/search?','file://','replytocom',
                     'add-to-cart=','ddownload=','tmpl=component','print=1','/winkelwagen/',
                     '/shoppingcart/','/shopping-cart/']
    filters_files = ['.jpg','.jpeg','.gif','.png','.tiff','.tif','.eps',
                     '.exe','.xlsx','.xls','.txt','.docx','.msi','.raw','.bmp',
                     '.mid','.mp3','.mp4','.avi','.pdf','.zip','.doc','.7z',
                     '.ico','.csv','.odt','.ods','.ppt','.pptx',
                     '.epub','.flac','.ogg','.wav','.mpeg','.mov','.wmv','.flv','.webm','.mkv']
    filters_files_upper = [file.upper() for file in filters_files]
    filters_inner = filters_inner + filters_files + filters_files_upper
    seed, seed_base, seed_path = edit_seed(seed)
    print('Seed base: ', seed_base, '\n')
    seed_no_punct = urllib.parse.urlparse(seed).netloc.replace('.', '_')
    inner_links = []
    outer_links = []
    frame_page = False
    outer_links_base = {}
    links_per_level = {}
    links_per_level[0] = {}
    links_per_level[0]['inner'] = [seed]
    inner_links.append(seed)
    for i in range(step):

        if i not in links_per_level.keys():
            pass
        else:
            if not 'inner' in links_per_level[i].keys():
                pass
            else:
                for current_url in links_per_level[i]['inner']:
                    if not (i+1) in links_per_level.keys():
                        links_per_level[i+1] = {}
                    try:
                        session = requests.Session()
                        response = session.get(current_url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}, verify=False)
                        seed_soup = BeautifulSoup(response.content, 'html.parser', from_encoding="iso-8859-1")
                        #print(seed_soup)
                        #Collect link-carrying tags: frames/iframes (src) and anchors/areas (href)
                        current_page_link_tags = []
                        for frame in seed_soup.find_all(['frame','iframe','FRAME','IFRAME']):
                            if frame.has_attr('src'):
                                current_page_link_tags.append((frame,'src'))
                        for a in seed_soup.find_all(['a','A','area','AREA']):
                            if a.has_attr('href'):
                                current_page_link_tags.append((a,'href'))
                        for tag in current_page_link_tags:
                            #print('current url: ', current_url)
                            #print(tag)
                            if tag[0].has_attr('onclick'):
                                if 'http' in tag[0]['onclick'] and tag[0][tag[1]] not in tag[0]['onclick']:
                                    raw_link = tag[0]['onclick']
                                    working_link = ''
                                    #print('raw link: ', raw_link)
                                    #if (raw_link.strip('/') not in [link.strip('/') for link in inner_links]) or (raw_link.strip('/') not in [link.strip('/') for link in outer_links]):
                                    working_link = raw_link[:raw_link.find('\',')][raw_link[:raw_link.find('\',')].find('http'):].strip(' ”\'')
                                    #else:
                                    #    pass
                                else:
                                    raw_link = tag[0][tag[1]]
                                    working_link = ''
                                    #print('raw link: ', raw_link)
                                    #if (raw_link.strip('/') not in [link.strip('/') for link in inner_links]) or (raw_link.strip('/') not in [link.strip('/') for link in outer_links]):
                                    working_link = tag[0][tag[1]].strip(' ”\'')
                                    #else:
                                    #    pass
                            else:
                                raw_link = tag[0][tag[1]]
                                working_link = ''
                                #print('raw link: ', raw_link)
                                #if (raw_link.strip('/') not in [link.strip('/') for link in inner_links]) or (raw_link.strip('/') not in [link.strip('/') for link in outer_links]):
                                working_link = tag[0][tag[1]].strip(' ”\'')
                                #else:
                                #    pass
                            if working_link:
                                if working_link not in ['','/',seed,'#']:
                                    if not any(x in working_link for x in filters_all):
                                        working_link = fix_http_error(working_link)
                                        if 'http' not in working_link:
                                            working_link = urllib.parse.urljoin(current_url, working_link)
                                        if (urllib.parse.urlparse(working_link).hostname
                                                and '.' in str(urllib.parse.urlparse(working_link).hostname)
                                                and custom_cache_extract(working_link).suffix):
                                            if '><' in working_link:
                                                working_link = working_link.split('><')[0]
                                            if 'target=' in working_link:
                                                working_link = working_link[:working_link.find('target=')]
                                            if '#' in working_link:
                                                working_link = working_link[:working_link.find('#')]
                                            working_link = correct_capitalization_error(working_link)
                                            working_link, temp_seed_base = check_subdomain_discrepancies(working_link, seed_base, custom_cache_extract)
                                            working_link = correct_slash_error(working_link, temp_seed_base)
                                            working_short = working_link[:working_link.rfind('/')+1]
                                            if '?' in working_short:
                                                working_short = working_short[:working_short.rfind('?')+1]
                                            if temp_seed_base in working_short:
                                                if not any(x in working_link for x in filters_inner):
                                                    working_link = check_http_discrepancies(working_link, seed, temp_seed_base)
                                                    #print('working link: ', working_link, '\n')
                                                    if working_link.strip('/') not in [link.strip('/') for link in inner_links]:
                                                        links_per_level = dict_check_for_presence('inner', links_per_level, i+1, working_link)
                                                        inner_links.append(working_link)
                                                    else:
                                                        #print('\n')
                                                        pass
                                            else:
                                                #print('working link: ', working_link, '\n')
                                                linkparse = urllib.parse.urlparse(working_link)
                                                if linkparse.hostname:
                                                    if linkparse.hostname.replace('.html','').replace('.','').replace(':','').isnumeric() == False:
                                                        link_base = linkparse.hostname
                                                        if link_base not in outer_links_base.keys():
                                                            outer_links_base[link_base] = 1
                                                        else:
                                                            outer_links_base[link_base] += 1
                                                        if working_link.strip('/') not in [link.strip('/') for link in outer_links]:
                                                            links_per_level = dict_check_for_presence('outer', links_per_level, i+1, working_link)
                                                            outer_links.append(working_link)
                                                        else:
                                                            #print('\n')
                                                            pass
                                                    else:
                                                        #print('\n')
                                                        pass
                                                else:
                                                    #print('\n')
                                                    pass
                    except:
                        pass
    if seed_path != '/' and seed_path != '':
        log_name = 'Link_logs/'+seed_no_punct+'_'+seed_path.strip('/').replace('/','_')+'_linklog.log'
    else:
        log_name = 'Link_logs/'+seed_no_punct+'_linklog.log'
    pretty_dict_str = pp.pformat(links_per_level)
    links_per_level = {k: v for k, v in links_per_level.items() if v}
    return links_per_level, outer_links_base, log_name

#Crawl every seed in 'logs' four levels deep and write the results to a log file
for seed in logs:
    link_dict, base_outer_links, log_name = scrape_links_and_SM(seed, 4)
    if link_dict and base_outer_links and log_name:
        outerlinks_count = 0
        outerlinks_total_count = 0
        for k, v in base_outer_links.items():
            outerlinks_count += 1
            outerlinks_total_count += v
        link_counts = [-1, 0]
        for key, value in link_dict.items():
            if 'inner' in value.keys():
                link_counts[0] += len(value['inner'])
            if 'outer' in value.keys():
                link_counts[1] += len(value['outer'])
        links_count_dict = {'Unique inner links:': link_counts[0], 'Unique outer links': link_counts[1],
                            'Unique base outer links:': outerlinks_count, 'Total outer links': outerlinks_total_count}
        link_dict_print_op = pp.pformat(link_dict)
        print_split = link_dict_print_op.split(',')
        for s in print_split:
            print_split[print_split.index(s)] = s.replace(' \'\n \'', ' ')
        reconstr_print = ','.join(print_split)
        with open(log_name, "w") as log_file:
            pprint.pprint(links_count_dict, log_file, sort_dicts=False, width=1)
            log_file.write('\n')
            pprint.pprint(base_outer_links, log_file, width=1)
            log_file.write('\n')
            log_file.write(reconstr_print)
            log_file.write('\n\n')
            log_file.write('Raw links counts: '+str(links_count_dict))
            log_file.write('\n\n')
            log_file.write('Raw base outer links: '+str(base_outer_links))
            log_file.write('\n\n')
            log_file.write('Raw link levels: '+str(link_dict))
    else:
        print(seed, ' gives error.')