
10 Appendix

10.1 Python code

10.1.4 Text scraper
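The listing below assumes the following imports; the import block itself is not part of this excerpt, so the module aliases (pd, np, pp) are inferred from how they are used in the code.

import ast
import glob
import random
import urllib.parse
import pprint as pp
import numpy as np
import pandas as pd
import requests
import tldextract
from urllib.error import URLError
from bs4 import BeautifulSoup
from langdetect import detect
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report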

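# Continuation of the edge-list construction from the preceding listing: the outgoing base
# links of a scraped seed are appended as weighted, directed edges, and the node and edge
# tables are written to CSV.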

seed_id = nodes_df_no_corpus[nodes_df_no_corpus['URL'] == seed_link]['Id'].values[0]
raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
for link, weight in raw_base_outer.items():
    if link not in no_http_corpus_link:
        current_id = nodes_df_no_corpus[nodes_df_no_corpus['URL'] == link]['Id'].values[0]
        edges_tuples.append((seed_id, current_id, 'Directed', weight))
edges_df_no_corpus = pd.DataFrame(edges_tuples, columns=['Source', 'Target', 'Type', 'Weight'])
nodes_df_no_corpus.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\nodes_df_nocorpus.csv', index=False)
edges_df_no_corpus.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\edges_df_nocorpus.csv', index=False)
nodes_df.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\nodes_df.csv', index=False)
edges_df.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\edges_df.csv', index=False)

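# Body of check_subdomain_discrepancies(working_link, seed_base, custom_cache_extract); the
# opening lines, which derive seed_base_sub, working_link_sub and seed_base_no_sub, are not
# part of this excerpt. The branches below align the sub-domain (e.g. 'www') of a scraped
# link with that of the seed.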

    if seed_base_no_sub in working_link:
        working_link_split = working_link.split('://')
        if seed_base_sub and (not working_link_sub):
            if seed_base_sub == 'www':
                sub_include_link = working_link_split[0]+'://'+seed_base_sub+'.'+working_link_split[1]
                try:
                    requests.get(sub_include_link, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}, verify=False)
                    return sub_include_link, seed_base
                except URLError:
                    temp_seed_base = seed_base_no_sub
                    return working_link, temp_seed_base
            else:
                return working_link, seed_base
        elif (not seed_base_sub) and working_link_sub:
            if working_link_sub == 'www':
                sub_exclude_link = working_link_split[0]+'://'+working_link_split[1].replace(working_link_sub+'.','')
                try:
                    requests.get(sub_exclude_link, headers={'User-Agent': 'Mozilla/5.0'})
                    return sub_exclude_link, seed_base
                except URLError:
                    temp_seed_base = working_link_sub+'.'+seed_base
                    return working_link, temp_seed_base
            else:
                temp_seed_base = ' '+seed_base
                return working_link, temp_seed_base
        elif seed_base_sub and working_link_sub and seed_base_sub != working_link_sub:
            return working_link, seed_base
        elif seed_base_sub and working_link_sub and seed_base_sub == working_link_sub:
            return working_link, seed_base
        else:
            return working_link, seed_base
    else:
        return working_link, seed_base
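# Aligns the scheme (http/https) of a scraped link with that of the seed URL, falling back
# to the original link when the rewritten URL cannot be fetched.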

def check_http_discrepancies(working_link, seed, seed_base):
    if seed_base in working_link:
        working_link_split = working_link.split('://')
        seed_split = seed.split('://')
        if working_link_split[0] != seed_split[0]:
            working_link_w_seed_http = seed_split[0]+'://'+working_link_split[1]
            try:
                requests.get(working_link_w_seed_http, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}, verify=False)
                return working_link_w_seed_http
            except URLError:
                return working_link
        else:
            return working_link

def correct_capitalization_error(working_link):
    url_parse = urllib.parse.urlparse(working_link)
    isolated_link = working_link.replace(url_parse.scheme+'://','').replace(url_parse.path,'')
    corr_link = working_link.replace(isolated_link, isolated_link.lower())
    return corr_link

def correct_slash_error(working_link, seed_base):
    link_parse = urllib.parse.urlparse(working_link)
    working_link_no_scheme = working_link.replace(link_parse.scheme+'://','')
    link_path = urllib.parse.urlparse(working_link).path
    if link_parse.path == '':
        return working_link + '/'
    elif working_link_no_scheme.endswith('/') == False and working_link_no_scheme == seed_base.strip('/'):
        return working_link + '/'
    else:
        return working_link

def dict_check_for_presence(value, _dict, key, current_link):
    if not value in _dict[key].keys():
        _dict[key][value] = []
        _dict[key][value].append(current_link)
    else:
        if current_link not in _dict[key][value]:
            _dict[key][value].append(current_link)
    return _dict

def edit_seed(seed):
    parse_seed = urllib.parse.urlparse(seed)
    seed_path = urllib.parse.urlparse(seed).path
    if seed_path != '/':
        if not '.' in seed_path:
            seed_base = urllib.parse.urlparse(seed).hostname + seed_path
        else:
            seed_base = urllib.parse.urlparse(seed).hostname
    else:
        seed_base = urllib.parse.urlparse(seed).hostname
    if seed_path == '':
        seed = seed + "/"
    return seed, seed_base, seed_path

def fix_http_error(link):
    url_parse = urllib.parse.urlparse(link)
    url_scheme = url_parse.scheme
    if url_scheme:
        if 's' in url_scheme:
            return 'https://' + link.replace(url_scheme,'').strip(':/')
        else:
            return 'http://' + link.replace(url_scheme,'').strip(':/')
    else:
        if 'https' in link[:link.rfind('/')]:
            split_link = list(link)
            split_link.insert(5, ':')
            split_link.insert(6, '/')
            split_link.insert(7, '/')
            return ''.join(split_link)
        elif 'http' in link[:link.rfind('/')]:
            split_link = list(link)
            split_link.insert(4, ':')
            split_link.insert(5, '/')
            split_link.insert(6, '/')
            return ''.join(split_link)
        else:
            return link
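# Breadth-first link scraper: starting from a seed URL it follows 'inner' links (same base
# domain) for the given number of steps, counts 'outer' links per external base domain, and
# returns the per-level link dictionary, the outer-link counts and a log-file name.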

def scrape_links_and_SM(seed, step):
    print('Seed: ', seed)
    try:
        requests.get(seed, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}, verify=False)
    except:
        print('Seed URL error')
        return None, None, None
    custom_cache_extract = tldextract.TLDExtract(cache_dir=None)
    filters_all = ['mailto:','MAILTO:','javascript:','@','share=','facebook.com/sharer','linkedin.com/shareArticle','whatsapp.com/send?','twitter.com/intent/tweet']
    filters_inner = ['/comment','/post-edit','/email-post','/search/','/search?','file://','replytocom',
                     'add-to-cart=','ddownload=','tmpl=component','print=1','/winkelwagen/','/shoppingcart/','/shopping-cart/']
    filters_files = ['.jpg','.jpeg','.gif','.png','.tiff','.tif','.eps',
                     '.exe','.xlsx','.xls','.txt','.docx','.msi','.raw','.bmp',
                     '.mid','.mp3','.mp4','.avi','.pdf','.zip','.doc','.7z','.ico','.csv','.odt','.ods','.ppt','.pptx',
                     '.epub','.flac','.ogg','.wav','.mpeg','.mov','.wmv','.flv','.webm','.mkv']
    filters_files_upper = [file.upper() for file in filters_files]
    filters_inner = filters_inner + filters_files + filters_files_upper
    seed, seed_base, seed_path = edit_seed(seed)
    print('Seed base: ', seed_base, '\n')
    seed_no_punct = urllib.parse.urlparse(seed).netloc.replace('.', '_')
    inner_links = []
    outer_links = []
    frame_page = False
    outer_links_base = {}
    links_per_level = {}
    links_per_level[0] = {}
    links_per_level[0]['inner'] = [seed]
    inner_links.append(seed)
    for i in range(step):
        if i not in links_per_level.keys():
            pass
        else:
            if not 'inner' in links_per_level[i].keys():
                pass
            else:
                for current_url in links_per_level[i]['inner']:
                    if not (i+1) in links_per_level.keys():
                        links_per_level[i+1] = {}
                    try:
                        session = requests.Session()
                        response = session.get(current_url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}, verify=False)
                        seed_soup = BeautifulSoup(response.content, 'html.parser', from_encoding="iso-8859-1")
                        #print(seed_soup)
                        current_page_link_tags = []
                        for frame in seed_soup.find_all(['frame','iframe','FRAME','IFRAME']):
                            if frame.has_attr('src'):
                                current_page_link_tags.append((frame, 'src'))
                        for a in seed_soup.find_all(['a','A','area','AREA']):
                            if a.has_attr('href'):
                                current_page_link_tags.append((a, 'href'))
                        for tag in current_page_link_tags:
                            #print('current url: ', current_url)
                            #print(tag)
                            if tag[0].has_attr('onclick'):
                                if 'http' in tag[0]['onclick'] and tag[0][tag[1]] not in tag[0]['onclick']:
                                    raw_link = tag[0]['onclick']
                                    working_link = ''
                                    #print('raw link: ', raw_link)
                                    #if (raw_link.strip('/') not in [link.strip('/') for link in inner_links]) or (raw_link.strip('/') not in [link.strip('/') for link in outer_links]):
                                    working_link = raw_link[:raw_link.find('\',')][raw_link[:raw_link.find('\',')].find('http'):].strip(' ”\'')
                                    #else:
                                    #    pass
                                else:
                                    raw_link = tag[0][tag[1]]
                                    working_link = ''
                                    #print('raw link: ', raw_link)
                                    #if (raw_link.strip('/') not in [link.strip('/') for link in inner_links]) or (raw_link.strip('/') not in [link.strip('/') for link in outer_links]):
                                    working_link = tag[0][tag[1]].strip(' ”\'')
                                    #else:
                                    #    pass
                            else:
                                raw_link = tag[0][tag[1]]
                                working_link = ''
                                #print('raw link: ', raw_link)
                                #if (raw_link.strip('/') not in [link.strip('/') for link in inner_links]) or (raw_link.strip('/') not in [link.strip('/') for link in outer_links]):
                                working_link = tag[0][tag[1]].strip(' ”\'')
                                #else:
                                #    pass
                            if working_link:
                                if working_link not in ['','/',seed,'#']:
                                    if not any(x in working_link for x in filters_all):
                                        working_link = fix_http_error(working_link)
                                        if 'http' not in working_link:
                                            working_link = urllib.parse.urljoin(current_url, working_link)
                                        if (urllib.parse.urlparse(working_link).hostname
                                                and '.' in str(urllib.parse.urlparse(working_link).hostname)
                                                and custom_cache_extract(working_link).suffix):
                                            if '><' in working_link:
                                                working_link = working_link.split('><')[0]
                                            if 'target=' in working_link:
                                                working_link = working_link[:working_link.find('target=')]
                                            if '#' in working_link:
                                                working_link = working_link[:working_link.find('#')]
                                            working_link = correct_capitalization_error(working_link)
                                            working_link, temp_seed_base = check_subdomain_discrepancies(working_link, seed_base, custom_cache_extract)
                                            working_link = correct_slash_error(working_link, temp_seed_base)
                                            working_short = working_link[:working_link.rfind('/')+1]
                                            if '?' in working_short:
                                                working_short = working_short[:working_short.rfind('?')+1]
                                            if temp_seed_base in working_short:
                                                if not any(x in working_link for x in filters_inner):
                                                    working_link = check_http_discrepancies(working_link, seed, temp_seed_base)
                                                    #print('working link: ', working_link, '\n')
                                                    if working_link.strip('/') not in [link.strip('/') for link in inner_links]:
                                                        links_per_level = dict_check_for_presence('inner', links_per_level, i+1, working_link)
                                                        inner_links.append(working_link)
                                                    else:
                                                        #print('\n')
                                                        pass
                                            else:
                                                #print('working link: ', working_link, '\n')
                                                linkparse = urllib.parse.urlparse(working_link)
                                                if linkparse.hostname:
                                                    if linkparse.hostname.replace('.html','').replace('.','').replace(':','').isnumeric() == False:
                                                        link_base = linkparse.hostname
                                                        if link_base not in outer_links_base.keys():
                                                            outer_links_base[link_base] = 1
                                                        else:
                                                            outer_links_base[link_base] += 1
                                                        if working_link.strip('/') not in [link.strip('/') for link in outer_links]:
                                                            links_per_level = dict_check_for_presence('outer', links_per_level, i+1, working_link)
                                                            outer_links.append(working_link)
                                                        else:
                                                            #print('\n')
                                                            pass
                                                else:
                                                    #print('\n')
                                                    pass
                                        else:
                                            #print('\n')
                                            pass
                    except:
                        pass
    if seed_path != '/' and seed_path != '':
        log_name = 'Random_link_logs/'+seed_no_punct+'_'+seed_path.strip('/').replace('/','_')+'_linklog.log'
    else:
        log_name = 'Thesis_2022/Random_link_logs/'+seed_no_punct+'_linklog.log'
    pretty_dict_str = pp.pformat(links_per_level)
    links_per_level = {k: v for k, v in links_per_level.items() if v}
    return links_per_level, outer_links_base, log_name
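# Load the annotated website corpus and derive the 'Categories primary', 'Categories
# secondary' and 'Categories tetriary' columns from the combined category field.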

reli_pd = pd.read_excel(r'Websites_corpus.xlsx')[:655]
reli_pd['Category secondary + tetriary'] = [t.replace('.','') for t in reli_pd['Category secondary + tetriary']]
cats = list(reli_pd['Category secondary + tetriary'])
#currentpos=0
for t in cats.copy():
    if ';' in t:
        split = t.split(';')
        cats.remove(t)
        for s in split:
            cats.append(s)
        # currentpos+=1
categories_secondary = [cat.partition(';')[0] for cat in reli_pd['Category secondary + tetriary']]
categories_tetriary = [cat.partition(';')[2] for cat in reli_pd['Category secondary + tetriary']]
for idx, x in enumerate(categories_tetriary):
    if x == '':
        categories_tetriary[idx] = categories_secondary[idx]
christianities = ['Protestantism','Oriental Orthodox Churches','Roman Catholicism','Eastern-Orthodox churches']
categories_primary = [cat if cat not in christianities else 'Christianity' for cat in categories_secondary]
reli_pd.insert(19, "Categories primary", categories_primary, True)
reli_pd.insert(20, "Categories secondary", categories_secondary, True)
reli_pd.insert(21, "Categories tetriary", categories_tetriary, True)
reli_pd_cats = reli_pd[['URL','C2: Stroming','Categories primary','Categories secondary','Categories tetriary','Language(s)']]
reli_pd_cats['URL'] = [url.strip('/') for url in reli_pd_cats['URL']]
primary_language = [lang.split(',')[0] for lang in reli_pd_cats['Language(s)']]
reli_pd_cats.insert(1, "Primary language", primary_language, True)
reli_pd_cats.drop('Language(s)', axis=1, inplace=True)

logs = glob.glob(r'Link_logs\*.log')

URL_list_scraped = []

raw_links_count_list = []

raw_base_outer_list = []

first_level_list = []

seed_list = []

for log in logs:
    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    current_level = []
    raw_links_count = ast.literal_eval(log.split('\n\n')[3].strip('Raw links counts: '))
    raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    seed_list.append(seed_link)
    URL_list_scraped.append(seed_link.strip('/'))
    raw_links_count_list.append((seed_link, raw_links_count))
    raw_base_outer_list.append((seed_link, raw_base_outer))
    if 1 in raw_link_level.keys():
        if 'inner' in raw_link_level[1].keys():
            first_level = raw_link_level[1]['inner']
            first_level.append(seed_link)
            first_level_list.append(first_level)
        else:
            first_level_list.append([seed_link])
    else:
        first_level_list.append([seed_link])


data = {'URL': seed_list,
        'Base outer links': raw_base_outer_list,
        'First level inner links': first_level_list}

level1_db = pd.DataFrame(data=data)

for link in reli_pd_cats['URL']:
    if link not in URL_list_scraped:
        reli_pd_cats = reli_pd_cats.drop(reli_pd_cats['URL'][reli_pd_cats['URL'] == link].index[0])

reli_pd_cats = reli_pd_cats.reset_index(drop=True)

Unique_inner_links = []

Unique_outer_links = []

Unique_base_outer_links = []

Total_outer_links = []

for link in raw_links_count_list:
    Unique_inner_links.append(link[1]['Unique inner links:'])
    Unique_outer_links.append(link[1]['Unique outer links'])
    Unique_base_outer_links.append(link[1]['Unique base outer links:'])
    Total_outer_links.append(link[1]['Total outer links'])
data = {'URL': URL_list_scraped,
        'Unique inner links': Unique_inner_links,
        'Unique outer links': Unique_outer_links,
        'Unique base outer links': Unique_base_outer_links,
        'Total outer links': Total_outer_links}
link_counts_df = pd.DataFrame(data=data)
link_counts_cats_df = link_counts_df.merge(reli_pd_cats, how='inner', on='URL')
link_counts_cats_df = link_counts_cats_df.merge(level1_db, how='inner', on='URL')

#Need to keep this because of bug
link_counts_cats_df.drop(link_counts_cats_df.index[(link_counts_cats_df['Unique inner links'] == 0) & (link_counts_cats_df['Unique outer links'] < 5)], inplace=True)
link_counts_cats_df = link_counts_cats_df.reset_index(drop=True)

domains = []

for url in link_counts_cats_df['URL']:
    domains.append(tldextract.extract(url)[2])
base_outer_domains = []
for row in link_counts_cats_df['Base outer links']:
    base_outer_domains.append([tldextract.extract(url)[2] for url in row[1].keys()])
link_counts_cats_df.insert(1, "URL domain", domains, True)
link_counts_cats_df.insert(11, "Base outer domains", base_outer_domains, True)
link_counts_cats_df

filters_files = ['.jpg','.jpeg','.gif','.png','.tiff','.tif','.eps',
                 '.exe','.xlsx','.xls','.txt','.docx','.msi','.raw','.bmp',
                 '.mid','.mp3','.mp4','.avi','.pdf','.zip','.doc','.7z','.ico','.csv','.odt','.ods','.ppt','.pptx',
                 '.epub','.flac','.ogg','.wav','.mpeg','.mov','.wmv','.flv','.webm','.mkv']
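# Scrape the <p> text of every first-level inner link of each corpus site and write it to
# paragraphs.txt, using sentinel strings to separate the URL from its paragraph data.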

with open(r'paragraphs.txt', 'w', encoding="utf-8") as fp:
    row_paragraphs_list = []
    for row in link_counts_cats_df['First level inner links']:
        paragraphs_list = []
        for url in row:
            if not any(_format in url for _format in filters_files):
                try:
                    session = requests.Session()
                    response = session.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}, verify=False)
                    print(url)
                    current_page_paras = []
                    seed_soup = BeautifulSoup(response.content, 'html.parser')
                    for para in seed_soup.find_all("p"):
                        current_page_paras.append(para.get_text())
                    #final_page_para = '\n'.join(current_page_paras)
                    paragraphs_list.append(current_page_paras)
                except:
                    print(url, ' failed')
                    continue
        #row_paragraphs_list.append(paragraphs_list)
        fp.write(url+'SPLITHEREFORURLANDPARAS'+str(paragraphs_list)+'PYTHONSPLITHEREPLEASETHANKYOU')

with open(r'paragraphs.txt', encoding="utf-8") as infile:
    paragraphs = infile.read()
paragraphs_list = paragraphs.split('PYTHONSPLITHEREPLEASETHANKYOU')[:613]
paragraphs_list = [(par.split('SPLITHEREFORURLANDPARAS')[0], par.split('SPLITHEREFORURLANDPARAS')[1]) for par in paragraphs_list]
# assumed: build the URL/paragraph DataFrame, parallel to paragraphs_random_df further below
paragraphs_df = pd.DataFrame(data=paragraphs_list, columns=['URL','Paragraph'])
paragraphs_listoflists = [ast.literal_eval(para) for para in paragraphs_df['Paragraph']]

paragraphs_strings = []

for site in paragraphs_listoflists:
    site_list = []
    for page in site:
        page_string = ' '.join(page)
        site_list.append(page_string)
    site_string = ' '.join(site_list)
    paragraphs_strings.append(site_string)

paragraphs_strings = [para.strip(' ') for para in paragraphs_strings]

para_languages = []

for site in paragraphs_strings:
    try:
        para_languages.append(detect(site))
    except:
        para_languages.append('')

paragraphs_df.drop('Paragraph', axis=1,inplace=True)

paragraphs_df.insert(1, "Paragraph lists", paragraphs_listoflists)
paragraphs_df.insert(2, "Paragraph strings", paragraphs_strings)
paragraphs_df.insert(3, "Paragraph main language", para_languages)
paragraphs_df
link_counts_cats_df_updated = link_counts_cats_df.merge(paragraphs_df, how='inner', on='URL')
link_counts_cats_df_updated

#Removing sites that upon manual inspection did not yield anything of note
failed_sites = ['https://kerkdienstgemist.nl','http://tesfaberhaneca.com/EECUN','http://www.evp-voices.com','https://www.harmonia-nl.org','https://zinweb.nl']
dutch_links_df = link_counts_cats_df_updated.loc[link_counts_cats_df_updated['Paragraph main language']=='nl'][['URL','C2: Stroming','Categories primary','Categories secondary','Categories tetriary','Paragraph lists','Paragraph strings']]
dutch_links_df = dutch_links_df.loc[~dutch_links_df['URL'].isin(failed_sites)]
dutch_links_df = dutch_links_df.reset_index(drop=True)

binary_categories_list = []

for val in dutch_links_df['Categories primary']:
    if val == 'Christianity':
        binary_categories_list.append('Christianity')
    else:
        binary_categories_list.append('Not Christianity')
dutch_links_df.insert(1, "Categories Chr binary", binary_categories_list)
dutch_links_df

page_size = []

av_par_lens = []

for row in dutch_links_df['Paragraph lists']:
    page_size.append(len(row))
    par_lens = []
    for par in row:
        par_lens.append(len(par))
    av_par_lens.append(round((sum(par_lens)/len(par_lens))))
dutch_links_df.insert(8, "Number of pages", page_size)
dutch_links_df.insert(9, "Av number of paragraphs", av_par_lens)
dutch_links_df

#Removing all Not Christianity rows which do have Christianity in their complete category list
dutch_links_df = dutch_links_df.loc[(dutch_links_df['Categories Chr binary'] == 'Christianity') | ((dutch_links_df['Categories Chr binary'] == 'Not Christianity') & (~dutch_links_df['C2: Stroming'].str.contains('Christendom')))]
dutch_links_df = dutch_links_df.reset_index(drop=True)
dutch_links_df
dutch_links_df['Categories Chr binary'].value_counts()

def no_http(url):
    url = url.replace('http://','')
    url = url.replace('https://','')
    return url
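# Collect outer .nl links of Christian corpus sites that are neither part of the corpus
# itself nor on the platform filter list; these serve as candidates for the random sample.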

nl_christian_outer_links = []

corpus_URLs = list(link_counts_cats_df['URL'].apply(no_http))

filter_platforms = ['www.facebook.com','www.instagram.com','www.youtube.com','soundcloud.com','w.soundcloud.com','api.whatsapp.com',
                    'twitter.com','google.com','google.nl','youtu.be','vimeo.com','pinterest.com','plus.google.com','www.marktplaats.nl']
for row in link_counts_cats_df.loc[link_counts_cats_df['Categories primary'] == 'Christianity']['Base outer links']:
    for site in list(row[1].keys()):
        if (site not in corpus_URLs) and (site not in filter_platforms):
            if site.endswith('.nl'):
                if site not in nl_christian_outer_links:
                    nl_christian_outer_links.append(site)
succesful_logs = 0

attempts = 0
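# Keep drawing random candidate sites until 101 of them yield a usable link log (at least
# five first-level inner links); every successful crawl is written to Random_link_logs.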

while succesful_logs < 101:
    random.seed(attempts)
    random_URL_selection = random.choice(nl_christian_outer_links)
    link_dict, base_outer_links, log_name = scrape_links_and_SM('http://'+random_URL_selection, 1)
    if link_dict and base_outer_links and log_name:
        if 'inner' in link_dict[1].keys():
            if len(link_dict[1]['inner']) >= 5:
                outerlinks_count = 0
                outerlinks_total_count = 0
                for k, v in base_outer_links.items():
                    outerlinks_count += 1
                    outerlinks_total_count += v
                link_counts = [-1, 0]
                for key, value in link_dict.items():
                    if 'inner' in value.keys():
                        link_counts[0] += len(value['inner'])
                    if 'outer' in value.keys():
                        link_counts[1] += len(value['outer'])
                links_count_dict = {'Unique inner links:': link_counts[0], 'Unique outer links': link_counts[1],
                                    'Unique base outer links:': outerlinks_count, 'Total outer links': outerlinks_total_count}
                link_dict_print_op = pp.pformat(link_dict)
                print_split = link_dict_print_op.split(',')
                for s in print_split:
                    print_split[print_split.index(s)] = s.replace(' \'\n \'', ' ')
                reconstr_print = ','.join(print_split)
                with open(log_name, "w") as log_file:
                    pp.pprint(links_count_dict, log_file, sort_dicts=False, width=1)
                    log_file.write('\n')
                    pp.pprint(base_outer_links, log_file, width=1)
                    log_file.write('\n')
                    log_file.write(reconstr_print)
                    log_file.write('\n\n')
                    log_file.write('Raw links counts: '+str(links_count_dict))
                    log_file.write('\n\n')
                    log_file.write('Raw base outer links: '+str(base_outer_links))
                    log_file.write('\n\n')
                    log_file.write('Raw link levels: '+str(link_dict))
                succesful_logs += 1
                attempts += 1
            else:
                print('http://'+random_URL_selection, ' is too small.')
                attempts += 1
        else:
            print('http://'+random_URL_selection, ' has no inner links.')
            attempts += 1
    else:
        print('http://'+random_URL_selection, ' gives error.')
        attempts += 1

logs_random_christ = glob.glob(r'Random_link_logs\*.log')

URL_list_scraped = []

raw_links_count_list = []

raw_base_outer_list = []

first_level_list = []

seed_list = []

for log in logs_random_christ:

    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    current_level = []
    raw_links_count = ast.literal_eval(log.split('\n\n')[3].strip('Raw links counts: '))
    raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    seed_list.append(seed_link)
    URL_list_scraped.append(seed_link.strip('/'))
    raw_links_count_list.append((seed_link, raw_links_count))
    raw_base_outer_list.append((seed_link, raw_base_outer))
    if 1 in raw_link_level.keys():
        if 'inner' in raw_link_level[1].keys():
            first_level = raw_link_level[1]['inner']
            first_level.append(seed_link)
            first_level_list.append(first_level)
        else:
            first_level_list.append([seed_link])
    else:
        first_level_list.append([seed_link])

data = {'URL': seed_list,
        'Base outer links': raw_base_outer_list,
        'First level inner links': first_level_list}
random_level1_db = pd.DataFrame(data=data)
random_level1_db

with open(r'C:\Users\Frank\Documents\Thesis_2022\paragraphs_random_100.txt', encoding="utf-8") as infile:
    paragraphs = infile.read()
paragraphs_random_list = paragraphs.split('PYTHONSPLITHEREPLEASETHANKYOU')[:100]
paragraphs_random_list = [(par.split('SPLITHEREFORURLANDPARAS')[0], par.split('SPLITHEREFORURLANDPARAS')[1]) for par in paragraphs_random_list]
paragraphs_random_df = pd.DataFrame(data=paragraphs_random_list, columns=['URL','Paragraph'])
paragraphs_random_df

paragraphs_random_listoflists = [ast.literal_eval(para) for para in paragraphs_random_df['Paragraph']]

paragraphs_random_strings = []

for site in paragraphs_random_listoflists:

    site_list = []
    for page in site:
        page_string = ' '.join(page)
        site_list.append(page_string)
    site_string = ' '.join(site_list)
    paragraphs_random_strings.append(site_string)

paragraphs_random_strings = [para.strip(' ') for para in paragraphs_random_strings]

para_random_languages = []

for site in paragraphs_random_strings:

    try:
        para_random_languages.append(detect(site))
    except:
        para_random_languages.append('')

paragraphs_random_df.drop('Paragraph', axis=1,inplace=True)

paragraphs_random_df.insert(1, "Paragraph lists", paragraphs_random_listoflists)
paragraphs_random_df.insert(2, "Paragraph strings", paragraphs_random_strings)
paragraphs_random_df.insert(3, "Paragraph main language", para_random_languages)
paragraphs_random_df


paragraphs_random_df_filtered = paragraphs_random_df.loc[(paragraphs_random_df['Paragraph strings'] != '') & (paragraphs_random_df['Paragraph main language'] == 'nl')]

#with open(r'C:\Users\Frank\Documents\Thesis_2022\random100_urls_labels.txt', 'w') as write_file:

# for url in paragraphs_random_df_filtered['URL']:

# write_file.write(url+',\n')

with open(r'random100_urls_labels.txt') as infile:

    rows = infile.read().split('\n')[:-1]

rows = [row.split(',')[1] for row in rows]

rows = ['Christianity' if row=='1' else 'Not Christianity' for row in rows]

paragraphs_random_df_filtered.insert(4,'Label',rows)

paragraphs_random_df_filtered
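# Baseline classifier: 10-fold stratified cross-validation of a linear SVM on
# count-vectorised (1-2 gram) paragraph text, predicting the binary Christianity label.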

paragraph_predictor = dutch_links_df['Paragraph strings']

category_response = dutch_links_df['Categories Chr binary']

with open(r'stopwords-nl.txt',encoding="utf-8") as infile:

    dutch_stopwords = infile.read().split('\n')

metric_per_gram = []

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
metrics = []
for train_index, test_index in folds.split(paragraph_predictor, category_response):
    #print("TRAIN:", train_index, "TEST:", test_index)
    par_train, par_test = paragraph_predictor[train_index], paragraph_predictor[test_index]
    cat_train, cat_test = category_response[train_index], category_response[test_index]
    cnt_vectorizer_unigram = CountVectorizer(stop_words=dutch_stopwords, ngram_range=(1,2))
    par_train_vectors = cnt_vectorizer_unigram.fit_transform(par_train)
    par_test_vectors = cnt_vectorizer_unigram.transform(par_test)
    svc_init = SVC(random_state=42, kernel='linear')
    svc_trained = svc_init.fit(par_train_vectors, cat_train)
    cat_test_predict = svc_trained.predict(par_test_vectors)
    #for cat1,cat2 in zip(cat_test,cat_test_predict):
    #    print(cat1,cat2,sep=' ')
    #print(accuracy_score(cat_test, cat_test_predict))
    metrics.append(accuracy_score(cat_test, cat_test_predict))
metrics = np.array(metrics)
print('Mean accuracy: ', np.mean(metrics, axis=0))
print('Std for accuracy: ', np.std(metrics, axis=0))
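# Same 10-fold set-up, but with TF-IDF weighted 1-2 gram features instead of raw counts.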

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
metrics = []
for train_index, test_index in folds.split(paragraph_predictor, category_response):
    #print("TRAIN:", train_index, "TEST:", test_index)
    par_train, par_test = paragraph_predictor[train_index], paragraph_predictor[test_index]
    cat_train, cat_test = category_response[train_index], category_response[test_index]
    tfidf_vectorizer_bigram = TfidfVectorizer(analyzer='word', stop_words=dutch_stopwords, ngram_range=(1,2))
    par_train_vectors = tfidf_vectorizer_bigram.fit_transform(par_train)
    par_test_vectors = tfidf_vectorizer_bigram.transform(par_test)
    svc_init = SVC(random_state=42, kernel='linear')
    svc_trained = svc_init.fit(par_train_vectors, cat_train)
    cat_test_predict = svc_trained.predict(par_test_vectors)
    #for cat1,cat2 in zip(cat_test,cat_test_predict):
    #    print(cat1,cat2,sep=' ')
    print(accuracy_score(cat_test, cat_test_predict))
    metrics.append(accuracy_score(cat_test, cat_test_predict))
metrics = np.array(metrics)
print('Mean accuracy: ', np.mean(metrics, axis=0))
print('Std for accuracy: ', np.std(metrics, axis=0))
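# Train the TF-IDF + linear SVM model on the full corpus and evaluate it on the separately
# labelled random sample of outer-link sites.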

paragraph_predictor = dutch_links_df['Paragraph strings']
category_response = dutch_links_df['Categories Chr binary']

random_paragraph_predictor = paragraphs_random_df_filtered['Paragraph strings']

random_category_response = paragraphs_random_df_filtered['Label']

tfidf_vectorizer_bigram = TfidfVectorizer(analyzer='word', stop_words=dutch_stopwords, ngram_range=(1,2))
model_vectors = tfidf_vectorizer_bigram.fit_transform(paragraph_predictor)
random_vectors = tfidf_vectorizer_bigram.transform(random_paragraph_predictor)

svc_init = SVC(random_state=42, kernel='linear')

svc_trained = svc_init.fit(model_vectors, category_response)

cat_test_predict = svc_trained.predict(random_vectors)

for cat1,cat2 in zip(random_category_response,cat_test_predict):

    print(cat1, cat2, sep=' ')

print(classification_report(random_category_response, cat_test_predict))