10 Appendix
10.1 Python code
10.1.4 Text scraper
118
# NOTE(review): PDF-extraction fragment. L5-L11 clearly ran inside a
# `for log in logs:`-style loop (they use per-log `seed_link`/`log`), whose
# header was lost on the previous page, while the to_csv calls ran after it;
# the original indentation is unrecoverable, so the lines are left
# byte-identical (including wrap artifacts such as "va lues" and "ind ex").
# Looks up the numeric node id for the current seed URL.
seed_id = nodes_df_no_corpus[nodes_df_no_corpus['URL'] == seed_link]['Id'].values[0]
# Parses the "Raw base outer links" dict out of the log text. The string
# literal was split across lines by extraction; elsewhere (L236) the same
# call uses 'Raw base outer links: '. NOTE(review): str.strip removes a
# character *set*, not a prefix — works here only because '{' is not in it.
raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links:
'))
# One directed, weighted edge per outer link that is not itself in the corpus.
for link, weight in raw_base_outer.items():
if link not in no_http_corpus_link:
current_id = nodes_df_no_corpus[nodes_df_no_corpus['URL'] == link]['Id'].va lues[0]
edges_tuples.append((seed_id,current_id,'Directed',weight))
# Materialise the edge list and export both Gephi-style CSVs (nodes + edges)
# for the no-corpus graph and for the full graph.
edges_df_no_corpus = pd.DataFrame(edges_tuples, columns =['Source', 'Target', 'Type','Weight']) nodes_df_no_corpus.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\nodes_df_nocorpus.csv',ind ex=False)
edges_df_no_corpus.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\edges_df_nocorpus.csv',ind ex=False)
nodes_df.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\nodes_df.csv',index=False) edges_df.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\edges_df.csv',index=False)
119
# NOTE(review): PDF-extraction fragment — the tail of
# check_subdomain_discrepancies(working_link, seed_base, custom_cache_extract)
# (signature inferred from its call site in scrape_links_and_SM). The `def`
# line and the preamble computing seed_base_no_sub / seed_base_sub /
# working_link_sub were lost in extraction, and several identifiers below are
# split by line-wrap artifacts ("working_link_s plit", "Macinto sh", ...).
# Left byte-identical; cannot run as extracted.
# Behaviour (as far as visible): if the seed uses a 'www.' subdomain and the
# link does not, probe the link with 'www.' added; conversely strip a 'www.'
# the seed lacks; otherwise return the link and seed_base unchanged.
# Returns a (link, seed_base) pair throughout.
if seed_base_no_sub in working_link:
working_link_split = working_link.split('://') if seed_base_sub and (not working_link_sub):
if seed_base_sub == 'www':
# Re-insert the seed's 'www.' subdomain into the link and test it.
sub_include_link = working_link_split[0]+'://'+seed_base_sub+'.'+working_link_s plit[1]
try:
requests.get(sub_include_link, headers={'User-Agent': 'Mozilla/5.0 (Macinto sh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/5 37.36'},verify=False)
# NOTE(review): requests raises RequestException, not URLError — this
# handler presumably never fires; verify against the lost import block.
return sub_include_link, seed_base except URLError:
temp_seed_base = seed_base_no_sub return working_link, temp_seed_base else:
return working_link, seed_base
elif (not seed_base_sub) and working_link_sub:
if working_link_sub == 'www':
# Strip the link's 'www.' subdomain and test the stripped URL.
sub_exclude_link = working_link_split[0]+'://'+working_link_split[1].replace(wo rking_link_sub+'.','')
try:
requests.get(sub_exclude_link, headers={'User-Agent': 'Mozilla/5.0'}) return sub_exclude_link, seed_base
except URLError:
temp_seed_base = working_link_sub+'.'+seed_base return working_link, temp_seed_base
else:
# NOTE(review): "' '+seed_base" looks like a further extraction garble
# (probably working_link_sub+'.'+seed_base) — unverifiable from here.
temp_seed_base = ' '+seed_base return working_link, temp_seed_base
elif seed_base_sub and working_link_sub and seed_base_sub != working_link_sub:
return working_link, seed_base
elif seed_base_sub and working_link_sub and seed_base_sub == working_link_sub:
return working_link, seed_base else:
return working_link, seed_base else:
return working_link, seed_base
def check_http_discrepancies(working_link, seed, seed_base):
    """Align working_link's scheme (http vs https) with the seed's scheme.

    If the link belongs to the seed's base but uses a different scheme, the
    link is probed under the seed's scheme and returned when the request
    succeeds; otherwise the original link is returned unchanged.

    NOTE(review): implicitly returns None when seed_base is not a substring
    of working_link — callers appear to guarantee that it is.
    """
    if seed_base in working_link:
        working_link_split = working_link.split('://')
        seed_split = seed.split('://')
        if working_link_split[0] != seed_split[0]:
            working_link_w_seed_http = seed_split[0] + '://' + working_link_split[1]
            try:
                # Probe the rewritten URL; TLS certificate errors are ignored.
                requests.get(working_link_w_seed_http,
                             headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'},
                             verify=False)
                return working_link_w_seed_http
            except URLError:
                # Fixed NameError: the listing had `URLerror`. NOTE(review):
                # requests raises RequestException, not URLError, so a failed
                # request would actually propagate; kept as authored.
                return working_link
        else:
            return working_link
def correct_capitalization_error(working_link):
    """Lower-case the host portion of a URL, leaving the path untouched.

    The host is isolated by stripping the scheme prefix and the path from
    the URL string, then replaced by its lower-cased form in the full URL.
    """
    url_parse = urllib.parse.urlparse(working_link)
    isolated_link = working_link.replace(url_parse.scheme + '://', '').replace(url_parse.path, '')
    corr_link = working_link.replace(isolated_link, isolated_link.lower())
    return corr_link
def correct_slash_error(working_link, seed_base):
    """Ensure a trailing slash on bare-domain / seed-base URLs.

    Adds '/' when the URL has an empty path, or when the scheme-less URL
    equals the seed base, so equivalent URLs compare equal downstream.
    Any other URL is returned unchanged.
    """
    link_parse = urllib.parse.urlparse(working_link)
    working_link_no_scheme = working_link.replace(link_parse.scheme + '://', '')
    if link_parse.path == '':
        return working_link + '/'
    elif not working_link_no_scheme.endswith('/') and working_link_no_scheme == seed_base.strip('/'):
        return working_link + '/'
    else:
        return working_link
def dict_check_for_presence(value, _dict, key, current_link):
    """Record current_link under _dict[key][value], without duplicates.

    Creates the list on first use, appends current_link only if it is not
    already present, and returns the (mutated) dict.
    """
    bucket = _dict[key].setdefault(value, [])
    if current_link not in bucket:
        bucket.append(current_link)
    return _dict
def edit_seed(seed):
    """Normalise a seed URL.

    Returns (seed, seed_base, seed_path): seed_base is the hostname,
    extended with the path when the seed lives in a sub-directory (a path
    containing no '.'); seed gains a trailing '/' when its path is empty.
    """
    parse_seed = urllib.parse.urlparse(seed)
    seed_path = parse_seed.path
    if seed_path != '/':
        if '.' not in seed_path:
            # Sub-directory seed: keep the directory as part of the base.
            seed_base = parse_seed.hostname + seed_path
        else:
            # Path points at a file (e.g. index.html): base is just the host.
            seed_base = parse_seed.hostname
    else:
        seed_base = parse_seed.hostname
    if seed_path == '':
        seed = seed + "/"
    return seed, seed_base, seed_path
def fix_http_error(link):
    """Repair malformed http/https prefixes on a URL.

    With a scheme present, the scheme text is stripped from the URL and a
    clean 'http://' or 'https://' prefix is re-attached (https whenever the
    scheme contains an 's'). Without a scheme, a bare 'https'/'http' run at
    the start of the URL (e.g. 'httpexample.com') gets '://' spliced in.
    Anything else is returned unchanged.
    """
    url_parse = urllib.parse.urlparse(link)
    url_scheme = url_parse.scheme
    if url_scheme:
        if 's' in url_scheme:
            return 'https://' + link.replace(url_scheme, '').strip(':/')
        else:
            return 'http://' + link.replace(url_scheme, '').strip(':/')
    else:
        if 'https' in link[:link.rfind('/')]:
            # Splice '://' after the 5-char 'https' run.
            split_link = list(link)
            split_link.insert(5, ':')
            split_link.insert(6, '/')
            split_link.insert(7, '/')
            return ''.join(split_link)
        elif 'http' in link[:link.rfind('/')]:
            # Splice '://' after the 4-char 'http' run.
            split_link = list(link)
            split_link.insert(4, ':')
            split_link.insert(5, '/')
            split_link.insert(6, '/')
            return ''.join(split_link)
        else:
            return link
def scrape_links_and_SM(seed, step):
    """Breadth-first link scraper.

    Starting from *seed*, follows internal ('inner') links for *step* levels,
    splitting every discovered link into inner (same base as the seed) and
    outer links, and tallying outer base domains.

    Returns (links_per_level, outer_links_base, log_name), or
    (None, None, None) when the seed itself cannot be fetched.
    """
    print('Seed: ', seed)
    # Same UA string used for every request in this function.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/35.0.1916.153 Safari/537.36'}
    try:
        requests.get(seed, headers=headers, verify=False)
    except:
        # Seed unreachable: nothing to scrape.
        print('Seed URL error')
        return None, None, None
    custom_cache_extract = tldextract.TLDExtract(cache_dir=None)
    # Links that are never followed at all (mail/JS/share widgets).
    filters_all = ['mailto:', 'MAILTO:', 'javascript:', '@', 'share=',
                   'facebook.com/sharer', 'linkedin.com/shareArticle',
                   'whatsapp.com/send?', 'twitter.com/intent/tweet']
    # Inner links that are skipped (search pages, carts, comment forms, ...).
    filters_inner = ['/comment', '/post-edit', '/email-post', '/search/',
                     '/search?', 'file://', 'replytocom', 'add-to-cart=',
                     'ddownload=', 'tmpl=component', 'print=1',
                     '/winkelwagen/', '/shoppingcart/', '/shopping-cart/']
    # File extensions to skip. NOTE(review): the original listing was missing
    # a comma ('.7z' '.ico' concatenated to the single entry '.7z.ico');
    # fixed here so both extensions are filtered.
    filters_files = ['.jpg', '.jpeg', '.gif', '.png', '.tiff', '.tif', '.eps',
                     '.exe', '.xlsx', '.xls', '.txt', '.docx', '.msi', '.raw',
                     '.bmp', '.mid', '.mp3', '.mp4', '.avi', '.pdf', '.zip',
                     '.doc', '.7z', '.ico', '.csv', '.odt', '.ods', '.ppt',
                     '.pptx', '.epub', '.flac', '.ogg', '.wav', '.mpeg',
                     '.mov', '.wmv', '.flv', '.webm', '.mkv']
    filters_files_upper = [file.upper() for file in filters_files]
    filters_inner = filters_inner + filters_files + filters_files_upper
    seed, seed_base, seed_path = edit_seed(seed)
    print('Seed base: ', seed_base, '\n')
    seed_no_punct = urllib.parse.urlparse(seed).netloc.replace('.', '_')
    inner_links = []
    outer_links = []
    outer_links_base = {}
    # links_per_level[level]['inner'|'outer'] -> list of links found there.
    links_per_level = {0: {'inner': [seed]}}
    inner_links.append(seed)
    for i in range(step):
        if i not in links_per_level or 'inner' not in links_per_level[i]:
            continue
        for current_url in links_per_level[i]['inner']:
            if (i + 1) not in links_per_level:
                links_per_level[i + 1] = {}
            try:
                session = requests.Session()
                response = session.get(current_url, headers=headers, verify=False)
                seed_soup = BeautifulSoup(response.content, 'html.parser',
                                          from_encoding="iso-8859-1")
                # Candidate (tag, attribute) pairs: frames first, then anchors.
                current_page_link_tags = []
                for frame in seed_soup.find_all(['frame', 'iframe', 'FRAME', 'IFRAME']):
                    if frame.has_attr('src'):
                        current_page_link_tags.append((frame, 'src'))
                for a in seed_soup.find_all(['a', 'A', 'area', 'AREA']):
                    if a.has_attr('href'):
                        current_page_link_tags.append((a, 'href'))
                for tag in current_page_link_tags:
                    # Prefer a URL hidden in onclick when it differs from href/src.
                    if (tag[0].has_attr('onclick') and 'http' in tag[0]['onclick']
                            and tag[0][tag[1]] not in tag[0]['onclick']):
                        raw_link = tag[0]['onclick']
                        head = raw_link[:raw_link.find('\',')]
                        working_link = head[head.find('http'):].strip(' ”\'')
                    else:
                        working_link = tag[0][tag[1]].strip(' ”\'')
                    if (working_link and working_link not in ['/', seed, '#']
                            and not any(x in working_link for x in filters_all)):
                        working_link = fix_http_error(working_link)
                        if 'http' not in working_link:
                            # Relative link: resolve against the current page.
                            working_link = urllib.parse.urljoin(current_url, working_link)
                        hostname = urllib.parse.urlparse(working_link).hostname
                        if hostname and '.' in str(hostname) and custom_cache_extract(working_link).suffix:
                            # Trim markup/attribute junk that survived parsing.
                            if '><' in working_link:
                                working_link = working_link.split('><')[0]
                            if 'target=' in working_link:
                                working_link = working_link[:working_link.find('target=')]
                            if '#' in working_link:
                                working_link = working_link[:working_link.find('#')]
                            working_link = correct_capitalization_error(working_link)
                            working_link, temp_seed_base = check_subdomain_discrepancies(
                                working_link, seed_base, custom_cache_extract)
                            working_link = correct_slash_error(working_link, temp_seed_base)
                            # Directory part of the URL decides inner vs outer.
                            working_short = working_link[:working_link.rfind('/') + 1]
                            if '?' in working_short:
                                working_short = working_short[:working_short.rfind('?') + 1]
                            if temp_seed_base in working_short:
                                # Inner link: keep unless filtered, dedupe on '/'.
                                if not any(x in working_link for x in filters_inner):
                                    working_link = check_http_discrepancies(
                                        working_link, seed, temp_seed_base)
                                    if working_link.strip('/') not in [link.strip('/') for link in inner_links]:
                                        links_per_level = dict_check_for_presence(
                                            'inner', links_per_level, i + 1, working_link)
                                        inner_links.append(working_link)
                            else:
                                # Outer link: tally its base domain, skip
                                # purely numeric hosts (IP addresses).
                                linkparse = urllib.parse.urlparse(working_link)
                                if linkparse.hostname:
                                    if not linkparse.hostname.replace('.html', '').replace('.', '').replace(':', '').isnumeric():
                                        link_base = linkparse.hostname
                                        if link_base not in outer_links_base:
                                            outer_links_base[link_base] = 1
                                        else:
                                            outer_links_base[link_base] += 1
                                        if working_link.strip('/') not in [link.strip('/') for link in outer_links]:
                                            links_per_level = dict_check_for_presence(
                                                'outer', links_per_level, i + 1, working_link)
                                            outer_links.append(working_link)
            except:
                # Page fetch/parse failed: skip this URL.
                pass
    # NOTE(review): the two branches write to different directories
    # ('Random_link_logs/' vs 'Thesis_2022/Random_link_logs/') — kept as
    # authored; confirm against the author's working directory layout.
    if seed_path != '/' and seed_path != '':
        log_name = 'Random_link_logs/' + seed_no_punct + '_' + seed_path.strip('/').replace('/', '_') + '_linklog.log'
    else:
        log_name = 'Thesis_2022/Random_link_logs/' + seed_no_punct + '_linklog.log'
    # Drop levels that ended up empty.
    links_per_level = {k: v for k, v in links_per_level.items() if v}
    return links_per_level, outer_links_base, log_name


# --- top-level script (continued): load the annotated website corpus. ---
reli_pd = pd.read_excel(r'Websites_corpus.xlsx')[:655]
# Clean the combined category column and derive primary/secondary/tertiary
# category columns plus a per-site primary language.
reli_pd['Category secondary + tetriary'] = [t.replace('.', '') for t in reli_pd['Category secondary + tetriary']]
cats = list(reli_pd['Category secondary + tetriary'])
# Flatten ';'-separated multi-category cells into individual entries.
for t in cats.copy():
    if ';' in t:
        split = t.split(';')
        cats.remove(t)
        for s in split:
            cats.append(s)
# Secondary category = text before ';', tertiary = text after it.
categories_secondary = [cat.partition(';')[0] for cat in reli_pd['Category secondary + tetriary']]
categories_tetriary = [cat.partition(';')[2] for cat in reli_pd['Category secondary + tetriary']]
# Sites without a tertiary category fall back to their secondary one.
for idx, x in enumerate(categories_tetriary):
    if x == '':
        categories_tetriary[idx] = categories_secondary[idx]
# Collapse all Christian denominations into one primary 'Christianity' label.
christianities = ['Protestantism', 'Oriental Orthodox Churches', 'Roman Catholicism',
                  'Eastern-Orthodox churches']
categories_primary = [cat if cat not in christianities else 'Christianity' for cat in categories_secondary]
reli_pd.insert(19, "Categories primary", categories_primary, True)
reli_pd.insert(20, "Categories secondary", categories_secondary, True)
reli_pd.insert(21, "Categories tetriary", categories_tetriary, True)
reli_pd_cats = reli_pd[['URL', 'C2: Stroming', 'Categories primary', 'Categories secondary', 'Categories tetriary', 'Language(s)']]
reli_pd_cats['URL'] = [url.strip('/') for url in reli_pd_cats['URL']]
# First language listed counts as the site's primary language.
primary_language = [lang.split(',')[0] for lang in reli_pd_cats['Language(s)']]
reli_pd_cats.insert(1, "Primary language", primary_language, True)
reli_pd_cats.drop('Language(s)', axis=1, inplace=True)
# Parse every corpus link log back into counts / outer-link dicts / level-1
# inner links, and keep only corpus rows that were actually scraped.
logs = glob.glob(r'Link_logs\*.log')
URL_list_scraped = []
raw_links_count_list = []
raw_base_outer_list = []
first_level_list = []
seed_list = []
for log in logs:
    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    # Logs store their raw dicts as repr() text in fixed '\n\n'-separated
    # sections; NOTE(review): str.strip removes a character set, not a
    # prefix — safe here because '{' is not in the stripped set.
    raw_links_count = ast.literal_eval(log.split('\n\n')[3].strip('Raw links counts: '))
    raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    seed_list.append(seed_link)
    URL_list_scraped.append(seed_link.strip('/'))
    raw_links_count_list.append((seed_link, raw_links_count))
    raw_base_outer_list.append((seed_link, raw_base_outer))
    # Level-1 inner links, always including the seed itself.
    if 1 in raw_link_level.keys():
        if 'inner' in raw_link_level[1].keys():
            first_level = raw_link_level[1]['inner']
            first_level.append(seed_link)
            first_level_list.append(first_level)
        else:
            first_level_list.append([seed_link])
    else:
        first_level_list.append([seed_link])
data = {'URL': seed_list,
        'Base outer links': raw_base_outer_list,
        'First level inner links': first_level_list}
level1_db = pd.DataFrame(data=data)
# Drop corpus rows for which no log (i.e. no successful scrape) exists.
for link in reli_pd_cats['URL']:
    if link not in URL_list_scraped:
        reli_pd_cats = reli_pd_cats.drop(reli_pd_cats['URL'][reli_pd_cats['URL'] == link].index[0])
reli_pd_cats = reli_pd_cats.reset_index(drop=True)
# Build per-site link-count columns and merge them with the category and
# level-1 data; annotate each site with its TLD and outer-link TLDs.
Unique_inner_links = []
Unique_outer_links = []
Unique_base_outer_links = []
Total_outer_links = []
for link in raw_links_count_list:
    # link is a (seed_url, counts_dict) pair; some keys carry a trailing ':'
    # exactly as written by the logging code.
    Unique_inner_links.append(link[1]['Unique inner links:'])
    Unique_outer_links.append(link[1]['Unique outer links'])
    Unique_base_outer_links.append(link[1]['Unique base outer links:'])
    Total_outer_links.append(link[1]['Total outer links'])
data = {'URL': URL_list_scraped,
        'Unique inner links': Unique_inner_links,
        'Unique outer links': Unique_outer_links,
        'Unique base outer links': Unique_base_outer_links,
        'Total outer links': Total_outer_links}
link_counts_df = pd.DataFrame(data=data)
link_counts_cats_df = link_counts_df.merge(reli_pd_cats, how='inner', on='URL')
link_counts_cats_df = link_counts_cats_df.merge(level1_db, how='inner', on='URL')
# Need to keep this because of bug (author's note): drop sites with no inner
# links and fewer than 5 outer links.
link_counts_cats_df.drop(link_counts_cats_df.index[(link_counts_cats_df['Unique inner links'] == 0) & (link_counts_cats_df['Unique outer links'] < 5)], inplace=True)
link_counts_cats_df = link_counts_cats_df.reset_index(drop=True)
# tldextract.extract(url)[2] is the suffix (TLD) part.
domains = []
for url in link_counts_cats_df['URL']:
    domains.append(tldextract.extract(url)[2])
base_outer_domains = []
for row in link_counts_cats_df['Base outer links']:
    base_outer_domains.append([tldextract.extract(url)[2] for url in row[1].keys()])
link_counts_cats_df.insert(1, "URL domain", domains, True)
link_counts_cats_df.insert(11, "Base outer domains", base_outer_domains, True)
link_counts_cats_df
# Fetch every level-1 inner page per site and dump all <p> texts to
# paragraphs.txt, using sentinel strings as record/field separators.
# NOTE(review): missing comma fixed ('.7z' '.ico' was concatenated to
# '.7z.ico' in the original listing).
filters_files = ['.jpg', '.jpeg', '.gif', '.png', '.tiff', '.tif', '.eps',
                 '.exe', '.xlsx', '.xls', '.txt', '.docx', '.msi', '.raw',
                 '.bmp', '.mid', '.mp3', '.mp4', '.avi', '.pdf', '.zip',
                 '.doc', '.7z', '.ico', '.csv', '.odt', '.ods', '.ppt',
                 '.pptx', '.epub', '.flac', '.ogg', '.wav', '.mpeg',
                 '.mov', '.wmv', '.flv', '.webm', '.mkv']
with open(r'paragraphs.txt', 'w', encoding="utf-8") as fp:
    for row in link_counts_cats_df['First level inner links']:
        paragraphs_list = []
        for url in row:
            if not any(_format in url for _format in filters_files):
                try:
                    session = requests.Session()
                    response = session.get(url,
                                           headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'},
                                           verify=False)
                    print(url)
                    current_page_paras = []
                    seed_soup = BeautifulSoup(response.content, 'html.parser')
                    for para in seed_soup.find_all("p"):
                        current_page_paras.append(para.get_text())
                    paragraphs_list.append(current_page_paras)
                except:
                    print(url, ' failed')
                    continue
        # NOTE(review): `url` here is whatever URL the inner loop visited
        # last (possibly a filtered/failed one), not the site's seed — kept
        # as authored since the reader only uses the field before the
        # sentinel as an opaque key.
        fp.write(url + 'SPLITHEREFORURLANDPARAS' + str(paragraphs_list) + 'PYTHONSPLITHEREPLEASETHANKYOU')
# Read paragraphs.txt back and turn it into a per-site DataFrame with the
# raw paragraph lists, one concatenated text string, and a detected language.
with open(r'paragraphs.txt', encoding="utf-8") as infile:
    paragraphs = infile.read()
paragraphs_list = paragraphs.split('PYTHONSPLITHEREPLEASETHANKYOU')[:613]
paragraphs_list = [(par.split('SPLITHEREFORURLANDPARAS')[0], par.split('SPLITHEREFORURLANDPARAS')[1]) for par in paragraphs_list]
# NOTE(review): the original listing used `paragraphs_df` without defining
# it — the construction line was evidently lost in extraction; reconstructed
# from the identical random-sample section of this appendix.
paragraphs_df = pd.DataFrame(data=paragraphs_list, columns=['URL', 'Paragraph'])
# 'Paragraph' holds the repr of a list of per-page paragraph lists.
paragraphs_listoflists = [ast.literal_eval(para) for para in paragraphs_df['Paragraph']]
paragraphs_strings = []
for site in paragraphs_listoflists:
    site_list = []
    for page in site:
        page_string = ' '.join(page)
        site_list.append(page_string)
    site_string = ' '.join(site_list)
    paragraphs_strings.append(site_string)
paragraphs_strings = [para.strip(' ') for para in paragraphs_strings]
# langdetect raises on empty/undetectable text: record '' in that case.
para_languages = []
for site in paragraphs_strings:
    try:
        para_languages.append(detect(site))
    except:
        para_languages.append('')
paragraphs_df.drop('Paragraph', axis=1, inplace=True)
paragraphs_df.insert(1, "Paragraph lists", paragraphs_listoflists)
paragraphs_df.insert(2, "Paragraph strings", paragraphs_strings)
paragraphs_df.insert(3, "Paragraph main language", para_languages)
paragraphs_df
link_counts_cats_df_updated = link_counts_cats_df.merge(paragraphs_df, how='inner', on='URL')
link_counts_cats_df_updated
# Removing sites that upon manual inspection did not result anything of note.
failed_sites = ['https://kerkdienstgemist.nl', 'http://tesfaberhaneca.com/EECUN',
                'http://www.evp-voices.com', 'https://www.harmonia-nl.org',
                'https://zinweb.nl']
# Keep only Dutch-language sites, then drop the manually rejected ones.
dutch_links_df = link_counts_cats_df_updated.loc[link_counts_cats_df_updated['Paragraph main language'] == 'nl'][['URL', 'C2: Stroming', 'Categories primary', 'Categories secondary', 'Categories tetriary', 'Paragraph lists', 'Paragraph strings']]
dutch_links_df = dutch_links_df.loc[~dutch_links_df['URL'].isin(failed_sites)]
dutch_links_df = dutch_links_df.reset_index(drop=True)
# Binary target: Christianity vs everything else.
binary_categories_list = []
for val in dutch_links_df['Categories primary']:
    if val == 'Christianity':
        binary_categories_list.append('Christianity')
    else:
        binary_categories_list.append('Not Christianity')
dutch_links_df.insert(1, "Categories Chr binary", binary_categories_list)
dutch_links_df
# Per site: number of scraped pages and average paragraphs per page.
# NOTE(review): a site whose 'Paragraph lists' is empty would raise
# ZeroDivisionError here — apparently ruled out by the earlier filtering.
page_size = []
av_par_lens = []
for row in dutch_links_df['Paragraph lists']:
    page_size.append(len(row))
    par_lens = []
    for par in row:
        par_lens.append(len(par))
    av_par_lens.append(round((sum(par_lens) / len(par_lens))))
dutch_links_df.insert(8, "Number of pages", page_size)
dutch_links_df.insert(9, "Av number of paragraphs", av_par_lens)
dutch_links_df
# Removing all Not Christianity rows which do have Christianity in their
# complete category list ('Christendom' in the C2: Stroming column).
dutch_links_df = dutch_links_df.loc[(dutch_links_df['Categories Chr binary'] == 'Christianity') | ((dutch_links_df['Categories Chr binary'] == 'Not Christianity') & (~dutch_links_df['C2: Stroming'].str.contains('Christendom')))]
dutch_links_df = dutch_links_df.reset_index(drop=True)
dutch_links_df
# Class balance of the binary target (notebook-style display).
dutch_links_df['Categories Chr binary'].value_counts()


def no_http(url):
    """Strip the http:// or https:// scheme prefix from *url*."""
    url = url.replace('http://', '')
    url = url.replace('https://', '')
    return url
# Collect .nl outer-link base domains referenced by Christian corpus sites,
# excluding corpus members themselves and common social/platform hosts.
nl_christian_outer_links = []
corpus_URLs = list(link_counts_cats_df['URL'].apply(no_http))
filter_platforms = ['www.facebook.com', 'www.instagram.com', 'www.youtube.com',
                    'soundcloud.com', 'w.soundcloud.com', 'api.whatsapp.com',
                    'twitter.com', 'google.com', 'google.nl', 'youtu.be',
                    'vimeo.com', 'pinterest.com', 'plus.google.com',
                    'www.marktplaats.nl']
for row in link_counts_cats_df.loc[link_counts_cats_df['Categories primary'] == 'Christianity']['Base outer links']:
    # row is a (seed_url, {base_domain: count}) pair.
    for site in list(row[1].keys()):
        if (site not in corpus_URLs) and (site not in filter_platforms):
            if site.endswith('.nl'):
                if site not in nl_christian_outer_links:
                    nl_christian_outer_links.append(site)
succesful_logs = 0
# Scrape randomly chosen non-corpus Christian-referenced .nl sites (1 level
# deep) until 101 usable logs exist. Seeding random with the attempt counter
# makes each draw reproducible.
attempts = 0
while succesful_logs < 101:
    random.seed(attempts)
    random_URL_selection = random.choice(nl_christian_outer_links)
    link_dict, base_outer_links, log_name = scrape_links_and_SM('http://' + random_URL_selection, 1)
    if link_dict and base_outer_links and log_name:
        if 'inner' in link_dict[1].keys():
            # Require at least 5 level-1 inner links to count as usable.
            if len(link_dict[1]['inner']) >= 5:
                outerlinks_count = 0
                outerlinks_total_count = 0
                for k, v in base_outer_links.items():
                    outerlinks_count += 1
                    outerlinks_total_count += v
                # Start inner count at -1 so the seed itself is not counted.
                link_counts = [-1, 0]
                for key, value in link_dict.items():
                    if 'inner' in value.keys():
                        link_counts[0] += len(value['inner'])
                    if 'outer' in value.keys():
                        link_counts[1] += len(value['outer'])
                links_count_dict = {'Unique inner links:': link_counts[0],
                                    'Unique outer links': link_counts[1],
                                    'Unique base outer links:': outerlinks_count,
                                    'Total outer links': outerlinks_total_count}
                # Re-join pprint's wrapped string pieces into single lines.
                link_dict_print_op = pp.pformat(link_dict)
                print_split = link_dict_print_op.split(',')
                for s in print_split:
                    print_split[print_split.index(s)] = s.replace(' \'\n \'', ' ')
                reconstr_print = ','.join(print_split)
                # Write both a pretty-printed and a raw (literal_eval-able)
                # copy of everything — the raw sections are what the log
                # parsers above read back.
                with open(log_name, "w") as log_file:
                    pp.pprint(links_count_dict, log_file, sort_dicts=False, width=1)
                    log_file.write('\n')
                    pp.pprint(base_outer_links, log_file, width=1)
                    log_file.write('\n')
                    log_file.write(reconstr_print)
                    log_file.write('\n\n')
                    log_file.write('Raw links counts: ' + str(links_count_dict))
                    log_file.write('\n\n')
                    log_file.write('Raw base outer links: ' + str(base_outer_links))
                    log_file.write('\n\n')
                    log_file.write('Raw link levels: ' + str(link_dict))
                succesful_logs += 1
                attempts += 1
            else:
                print('http://' + random_URL_selection, ' is too small.')
                attempts += 1
        else:
            print('http://' + random_URL_selection, ' has no inner links.')
            attempts += 1
    else:
        print('http://' + random_URL_selection, ' gives error.')
        attempts += 1
# Parse the random-sample logs exactly like the corpus logs above.
logs_random_christ = glob.glob(r'Random_link_logs\*.log')
URL_list_scraped = []
raw_links_count_list = []
raw_base_outer_list = []
first_level_list = []
seed_list = []
for log in logs_random_christ:
    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    raw_links_count = ast.literal_eval(log.split('\n\n')[3].strip('Raw links counts: '))
    raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    seed_list.append(seed_link)
    URL_list_scraped.append(seed_link.strip('/'))
    raw_links_count_list.append((seed_link, raw_links_count))
    raw_base_outer_list.append((seed_link, raw_base_outer))
    # Level-1 inner links, always including the seed itself.
    if 1 in raw_link_level.keys():
        if 'inner' in raw_link_level[1].keys():
            first_level = raw_link_level[1]['inner']
            first_level.append(seed_link)
            first_level_list.append(first_level)
        else:
            first_level_list.append([seed_link])
    else:
        first_level_list.append([seed_link])
data = {'URL': seed_list,
        'Base outer links': raw_base_outer_list,
        'First level inner links': first_level_list}
random_level1_db = pd.DataFrame(data=data)
random_level1_db
# Same paragraph post-processing as for the corpus, applied to the 100
# random-sample sites.
with open(r'C:\Users\Frank\Documents\Thesis_2022\paragraphs_random_100.txt', encoding="utf-8") as infile:
    paragraphs = infile.read()
paragraphs_random_list = paragraphs.split('PYTHONSPLITHEREPLEASETHANKYOU')[:100]
paragraphs_random_list = [(par.split('SPLITHEREFORURLANDPARAS')[0], par.split('SPLITHEREFORURLANDPARAS')[1]) for par in paragraphs_random_list]
paragraphs_random_df = pd.DataFrame(data=paragraphs_random_list, columns=['URL', 'Paragraph'])
paragraphs_random_df
paragraphs_random_listoflists = [ast.literal_eval(para) for para in paragraphs_random_df['Paragraph']]
paragraphs_random_strings = []
for site in paragraphs_random_listoflists:
    site_list = []
    for page in site:
        page_string = ' '.join(page)
        site_list.append(page_string)
    site_string = ' '.join(site_list)
    paragraphs_random_strings.append(site_string)
paragraphs_random_strings = [para.strip(' ') for para in paragraphs_random_strings]
# langdetect raises on empty/undetectable text: record '' in that case.
para_random_languages = []
for site in paragraphs_random_strings:
    try:
        para_random_languages.append(detect(site))
    except:
        para_random_languages.append('')
paragraphs_random_df.drop('Paragraph', axis=1, inplace=True)
paragraphs_random_df.insert(1, "Paragraph lists", paragraphs_random_listoflists)
paragraphs_random_df.insert(2, "Paragraph strings", paragraphs_random_strings)
paragraphs_random_df.insert(3, "Paragraph main language", para_random_languages)
paragraphs_random_df
130
# Keep only non-empty Dutch random sites and attach the hand-made labels.
paragraphs_random_df_filtered = paragraphs_random_df.loc[(paragraphs_random_df['Paragraph strings'] != '') & (paragraphs_random_df['Paragraph main language'] == 'nl')]
# The labels file was generated once (code kept commented in the original):
# every filtered URL was written as '<url>,\n' to random100_urls_labels.txt
# and a 0/1 label was added by hand after each comma.
#with open(r'C:\Users\Frank\Documents\Thesis_2022\random100_urls_labels.txt', 'w') as write_file:
#    for url in paragraphs_random_df_filtered['URL']:
#        write_file.write(url + ',\n')
with open(r'random100_urls_labels.txt') as infile:
    rows = infile.read().split('\n')[:-1]
rows = [row.split(',')[1] for row in rows]
rows = ['Christianity' if row == '1' else 'Not Christianity' for row in rows]
# NOTE(review): inserting into a .loc slice can trigger pandas'
# SettingWithCopy warning — kept as authored.
paragraphs_random_df_filtered.insert(4, 'Label', rows)
paragraphs_random_df_filtered
# 10-fold cross-validation: bag-of-words counts (uni+bigrams, Dutch
# stopwords removed) -> linear SVM, reporting mean/std accuracy.
paragraph_predictor = dutch_links_df['Paragraph strings']
category_response = dutch_links_df['Categories Chr binary']
with open(r'stopwords-nl.txt', encoding="utf-8") as infile:
    dutch_stopwords = infile.read().split('\n')
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
metrics = []
for train_index, test_index in folds.split(paragraph_predictor, category_response):
    par_train, par_test = paragraph_predictor[train_index], paragraph_predictor[test_index]
    cat_train, cat_test = category_response[train_index], category_response[test_index]
    # NOTE(review): named "_unigram" in the original although
    # ngram_range=(1, 2) includes bigrams — name kept as authored.
    cnt_vectorizer_unigram = CountVectorizer(stop_words=dutch_stopwords, ngram_range=(1, 2))
    par_train_vectors = cnt_vectorizer_unigram.fit_transform(par_train)
    par_test_vectors = cnt_vectorizer_unigram.transform(par_test)
    svc_init = SVC(random_state=42, kernel='linear')
    svc_trained = svc_init.fit(par_train_vectors, cat_train)
    cat_test_predict = svc_trained.predict(par_test_vectors)
    metrics.append(accuracy_score(cat_test, cat_test_predict))
metrics = np.array(metrics)
print('Mean accuracy: ', np.mean(metrics, axis=0))
print('Std for accuracy: ', np.std(metrics, axis=0))
# Same 10-fold CV as above, but with TF-IDF word uni+bigram features.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
metrics = []
for train_index, test_index in folds.split(paragraph_predictor, category_response):
    par_train, par_test = paragraph_predictor[train_index], paragraph_predictor[test_index]
    cat_train, cat_test = category_response[train_index], category_response[test_index]
    tfidf_vectorizer_bigram = TfidfVectorizer(analyzer='word', stop_words=dutch_stopwords, ngram_range=(1, 2))
    par_train_vectors = tfidf_vectorizer_bigram.fit_transform(par_train)
    par_test_vectors = tfidf_vectorizer_bigram.transform(par_test)
    svc_init = SVC(random_state=42, kernel='linear')
    svc_trained = svc_init.fit(par_train_vectors, cat_train)
    cat_test_predict = svc_trained.predict(par_test_vectors)
    # Per-fold accuracy, printed as in the original listing.
    print(accuracy_score(cat_test, cat_test_predict))
    metrics.append(accuracy_score(cat_test, cat_test_predict))
metrics = np.array(metrics)
print('Mean accuracy: ', np.mean(metrics, axis=0))
print('Std for accuracy: ', np.std(metrics, axis=0))
# Final model: train TF-IDF + linear SVM on the whole corpus, then evaluate
# on the hand-labelled random sample.
paragraph_predictor = dutch_links_df['Paragraph strings']
category_response = dutch_links_df['Categories Chr binary']
random_paragraph_predictor = paragraphs_random_df_filtered['Paragraph strings']
random_category_response = paragraphs_random_df_filtered['Label']
tfidf_vectorizer_bigram = TfidfVectorizer(analyzer='word', stop_words=dutch_stopwords, ngram_range=(1, 2))
model_vectors = tfidf_vectorizer_bigram.fit_transform(paragraph_predictor)
# Transform only: the random sample must use the corpus vocabulary.
random_vectors = tfidf_vectorizer_bigram.transform(random_paragraph_predictor)
svc_init = SVC(random_state=42, kernel='linear')
svc_trained = svc_init.fit(model_vectors, category_response)
cat_test_predict = svc_trained.predict(random_vectors)
for cat1, cat2 in zip(random_category_response, cat_test_predict):
    print(cat1, cat2, sep=' ')
print(classification_report(random_category_response, cat_test_predict))