
10 Appendix

10.1 Python code

10.1.3 Link analysis


    links_per_level = {k: v for k, v in links_per_level.items() if v}
    return links_per_level, outer_links_base, log_name

for seed in logs:
    link_dict, base_outer_links, log_name = scrape_links_and_SM(seed, 4)
    if link_dict and base_outer_links and log_name:
        outerlinks_count = 0
        outerlinks_total_count = 0
        for k, v in base_outer_links.items():
            outerlinks_count += 1
            outerlinks_total_count += v
        link_counts = [-1, 0]
        for key, value in link_dict.items():
            if 'inner' in value.keys():
                link_counts[0] += len(value['inner'])
            if 'outer' in value.keys():
                link_counts[1] += len(value['outer'])
        links_count_dict = {'Unique inner links:': link_counts[0],
                            'Unique outer links': link_counts[1],
                            'Unique base outer links:': outerlinks_count,
                            'Total outer links': outerlinks_total_count}
        link_dict_print_op = pp.pformat(link_dict)
        print_split = link_dict_print_op.split(',')
        for s in print_split:
            print_split[print_split.index(s)] = s.replace(' \'\n \'', ' ')
        reconstr_print = ','.join(print_split)
        with open(log_name, "w") as log_file:
            pprint.pprint(links_count_dict, log_file, sort_dicts=False, width=1)
            log_file.write('\n')
            pprint.pprint(base_outer_links, log_file, width=1)
            log_file.write('\n')
            log_file.write(reconstr_print)
            log_file.write('\n\n')
            log_file.write('Raw links counts: ' + str(links_count_dict))
            log_file.write('\n\n')
            log_file.write('Raw base outer links: ' + str(base_outer_links))
            log_file.write('\n\n')
            log_file.write('Raw link levels: ' + str(link_dict))
    else:
        print(seed, ' gives error.')
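Each log written above thus consists of six blank-line-separated sections: the pretty-printed link counts, the pretty-printed base outer links, the reformatted link-level dump, and the three machine-readable 'Raw ...' dumps. The parsing script further below relies on this layout when it indexes log.split('\n\n'); a minimal sketch of that assumption (valid as long as the pretty-printed sections contain no blank lines of their own):

# sections[0]-[2]: human-readable pprint output
# sections[3]: 'Raw links counts: {...}'
# sections[4]: 'Raw base outer links: {...}'
# sections[5]: 'Raw link levels: {...}'
sections = open(log_name, encoding='latin-1').read().split('\n\n')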


import glob
import urllib.parse
import urllib3
import pandas as pd
from collections import Counter
import pprint as pp
import json
import ast
import tldextract
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

reli_pd = pd.read_excel(r'Websites_corpus.xlsx')[:655]

reli_pd['Category secondary + tetriary'] = [t.replace('.','') for t in reli_pd['Category secondary + tetriary']]
cats = list(reli_pd['Category secondary + tetriary'])
for t in cats.copy():
    if ';' in t:
        split = t.split(';')
        cats.remove(t)
        for s in split:
            cats.append(s)

categories_secondary = [cat.partition(';')[0] for cat in reli_pd['Category secondary + tetriary']]
categories_tetriary = [cat.partition(';')[2] for cat in reli_pd['Category secondary + tetriary']]
for idx, x in enumerate(categories_tetriary):
    if x == '':
        categories_tetriary[idx] = categories_secondary[idx]

christianities = ['Protestantism','Oriental Orthodox Churches','Roman Catholicism','Eastern-Orthodox churches']
others = ['Native traditions','Non-theism','Mystic traditions','General','Secondary traditional religions']
categories_primary = [cat if cat not in christianities else 'Christianity' for cat in categories_secondary]
categories_primary = [cat if cat not in others else 'Other' for cat in categories_primary]

reli_pd.insert(19, "Categories primary", categories_primary, True)
reli_pd.insert(20, "Categories secondary", categories_secondary, True)
reli_pd.insert(21, "Categories tetriary", categories_tetriary, True)

reli_pd_cats = reli_pd[['URL','Categories primary','Categories secondary','Categories tetriary','Language(s)']]
reli_pd_cats['URL'] = [url.strip('/') for url in reli_pd_cats['URL']]
primary_language = [lang.split(',')[0] for lang in reli_pd_cats['Language(s)']]
reli_pd_cats.insert(1, "Primary language", primary_language, True)
reli_pd_cats.drop('Language(s)', axis=1, inplace=True)
reli_pd_cats

logs = glob.glob(r'Link_logs\*.log')

filters_all = ['mailto:','MAILTO:','javascript:','@','share=','facebook.com/sharer','linkedin.com/shareArticle',
               'whatsapp.com/send?','wa.me/?','twitter.com/intent/tweet','www.twitter.com/share','www.pinterest.com/pin',
               'reddit.com/submit','telegram.me/share']

filters_inner = ['/comment','/post-edit','/email-post','/search/','/search?','file://','replytocom',
                 'add-to-cart=','ddownload=','tmpl=component','print=1','/winkelwagen/','/shoppingcart/','/shopping-cart/']

filters_files = ['.jpg','.jpeg','.gif','.png','.tiff','.tif','.eps',
                 '.exe','.xlsx','.xls','.txt','.docx','.msi','.raw','.bmp',
                 '.mid','.mp3','.mp4','.avi','.pdf','.zip','.doc','.7z',
                 '.ico','.csv','.odt','.ods','.ppt','.pptx',
                 '.epub','.flac','.ogg','.wav','.mpeg','.mov','.wmv','.flv','.webm','.mkv']

filters_files_upper = [file.upper() for file in filters_files]

filters_inner = filters_inner + filters_files + filters_files_upper
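Note that filters_files_upper only covers fully upper-case extensions; mixed-case variants such as '.Pdf' would slip through. A case-insensitive check (an alternative sketch, not the approach used in this script) could lower-case the URL once before testing:

def has_filtered_extension(url, extensions=filters_files):
    # Alternative, case-insensitive variant; assumes the extensions are listed in lower case.
    return any(ext in url.lower() for ext in extensions)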

URL_list_scraped = []

raw_links_count_list = []

raw_base_outer_list = []

first_level_list = []

seed_list = []

for log_name in logs:
    with open(log_name, encoding='latin-1') as infile:
        log = infile.read()
    current_level = []
    raw_links_count = ast.literal_eval(log.split('\n\n')[3].strip('Raw links counts: '))
    raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))

    # This part cleans the data: the filtering of files within inner links and of social media
    # 'share' links within outer links had not happened adequately, or new filters were added
    # after the harvest. The reasoning for also subtracting equally from 'Total outer links' is
    # that each social media share link is customized for the page it is on, meaning it is used
    # only once.
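    # Worked example (hypothetical URLs): with filters_inner containing '/search?' and '.pdf',
    # ['https://example.org/about', 'https://example.org/search?q=x', 'https://example.org/doc.pdf']
    # is reduced by the comprehension below to ['https://example.org/about'], and the stored
    # counts are lowered by the difference (here 2).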

    for i in range(1, 5):
        if i in raw_link_level.keys():
            if 'inner' in raw_link_level[i].keys():
                before_inner_len = len(raw_link_level[i]['inner'])
                raw_link_level[i]['inner'] = [val for val in raw_link_level[i]['inner'] if not any(x in val for x in filters_inner)]
                after_inner_len = len(raw_link_level[i]['inner'])
                inner_difference = before_inner_len - after_inner_len
                #print(inner_difference)
                raw_links_count['Unique inner links:'] = raw_links_count['Unique inner links:'] - inner_difference

            if 'outer' in raw_link_level[i].keys():
                before_outer_len = len(raw_link_level[i]['outer'])
                current_outer = raw_link_level[i]['outer']
                raw_link_level[i]['outer'] = [val for val in raw_link_level[i]['outer'] if not any(x in val for x in filters_all)]
                after_outer_len = len(raw_link_level[i]['outer'])
                outer_difference = before_outer_len - after_outer_len
                raw_links_count['Unique outer links'] = raw_links_count['Unique outer links'] - outer_difference
                raw_links_count['Total outer links'] = raw_links_count['Total outer links'] - outer_difference
                #print(raw_links_count['Unique outer links'], raw_links_count['Total outer links'])
                for link in current_outer:
                    if link not in raw_link_level[i]['outer']:
                        link_base = urllib.parse.urlparse(link).hostname
                        if link_base in raw_base_outer.keys():


                            raw_base_outer[link_base] -= 1
                for base, count in zip(list(raw_base_outer.keys()), list(raw_base_outer.values())):
                    if count <= 0:
                        del raw_base_outer[base]
                #print(raw_base_outer)

    seed_link = raw_link_level[0]['inner'][0].strip('/')
    seed_list.append(seed_link)
    URL_list_scraped.append(seed_link.strip('/'))
    raw_links_count_list.append((seed_link, raw_links_count))
    raw_base_outer_list.append((seed_link, raw_base_outer))
    if 1 in raw_link_level.keys():
        if 'inner' in raw_link_level[1].keys():
            first_level = raw_link_level[1]['inner']
            first_level.append(seed_link)
            first_level_list.append(first_level)
        else:
            first_level_list.append([seed_link])
    else:
        first_level_list.append([seed_link])

data = {'URL': seed_list,
        'Base outer links': raw_base_outer_list,
        'First level inner links': first_level_list}
level1_db = pd.DataFrame(data=data)
level1_db

# Drop links that couldn't be scraped
for link in reli_pd_cats['URL']:
    if link not in URL_list_scraped:
        reli_pd_cats = reli_pd_cats.drop(reli_pd_cats['URL'][reli_pd_cats['URL'] == link].index[0])
reli_pd_cats = reli_pd_cats.reset_index(drop=True)
reli_pd_cats

Unique_inner_links = []

Unique_outer_links = []

Unique_base_outer_links = []

Total_outer_links = []

for link in raw_links_count_list:

    Unique_inner_links.append(link[1]['Unique inner links:'])
    Unique_outer_links.append(link[1]['Unique outer links'])
    Unique_base_outer_links.append(link[1]['Unique base outer links:'])
    Total_outer_links.append(link[1]['Total outer links'])

data = {'URL': URL_list_scraped,
        'Unique inner links': Unique_inner_links,
        'Unique outer links': Unique_outer_links,
        'Unique base outer links': Unique_base_outer_links,
        'Total outer links': Total_outer_links}
link_counts_df = pd.DataFrame(data=data)

link_counts_cats_df = link_counts_df.merge(reli_pd_cats, how='inner', on='URL')
link_counts_cats_df = link_counts_cats_df.merge(level1_db, how='inner', on='URL')
link_counts_cats_df


# Drop rows whose inner links are 0 and outer links are less than 5
link_counts_cats_df.drop(link_counts_cats_df.index[(link_counts_cats_df['Unique inner links'] == 0) &
                                                   (link_counts_cats_df['Unique outer links'] < 5)], inplace=True)
link_counts_cats_df = link_counts_cats_df.reset_index(drop=True)
link_counts_cats_df

## Add the domains as a separate column
domains = []
for url in link_counts_cats_df['URL']:
    domains.append(tldextract.extract(url)[2])

base_outer_domains = []
for row in link_counts_cats_df['Base outer links']:
    base_outer_domains.append([tldextract.extract(url)[2] for url in row[1].keys()])

link_counts_cats_df.insert(1, "URL domain", domains, True)
link_counts_cats_df.insert(10, "Base outer domains", base_outer_domains, True)
link_counts_cats_df

dict(Counter(link_counts_cats_df['URL domain']))

domains_series = link_counts_cats_df.loc[link_counts_cats_df['URL domain'] == 'nl']['Base outer domains']

total_domains = [item for sublist in domains_series for item in sublist]

#print(sum(dict(Counter(total_domains)).values()))

#print(len(dict(Counter(total_domains)).keys()))

#print(dict(Counter(total_domains)))

domains_series = link_counts_cats_df.loc[link_counts_cats_df['URL domain'] == 'com']['Base outer domains']

total_domains = [item for sublist in domains_series for item in sublist]

#print(sum(dict(Counter(total_domains)).values()))

#print(len(dict(Counter(total_domains)).keys()))

#print(dict(Counter(total_domains)))

domains_series = link_counts_cats_df.loc[link_counts_cats_df['URL domain'] == 'org']['Base outer domains']

total_domains = [item for sublist in domains_series for item in sublist]

#print(sum(dict(Counter(total_domains)).values()))

#print(len(dict(Counter(total_domains)).keys()))

#print(dict(Counter(total_domains)))

domains_series = link_counts_cats_df.loc[~link_counts_cats_df['URL domain'].isin(['nl','com','org'])]['Base outer domains']

#print(Counter(list(link_counts_cats_df.loc[~link_counts_cats_df['URL domain'].isin(['nl','com','org'])]['URL domain'])))

total_domains = [item for sublist in domains_series for item in sublist]

#print(sum(dict(Counter(total_domains)).values()))

#print(len(dict(Counter(total_domains)).keys()))

#print(dict(Counter(total_domains)))

non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique inner links'] != 0]

ticks = [1,10,100,500,1000,2500,5000,10000,50000,100000,200000]

# Plotting

sns.set_style('darkgrid')


sns.set(font_scale = 1.3)

#g.set(xticks=ticks)

fig, ax =plt.subplots(2,1,sharex=True)

p1 = sns.boxplot(data=non_zero_df, x='Unique inner links', palette='viridis', ax=ax[0])
p1.set(xscale='log')
p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=non_zero_df, x='Unique inner links', palette='viridis', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)

print(non_zero_df['Unique inner links'].describe())

top_df = non_zero_df[['URL','Unique inner links']].sort_values('Unique inner links', ascending=False).reset_index(drop=True)
top_df.index += 1
print(top_df.head(10))

non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique outer links'] != 0]

ticks = [1,10,100,500,1000,2500,5000,10000,50000]

# Plotting

sns.set_style('darkgrid')
sns.set(font_scale = 1.3)

#g.set(xticks=ticks)

fig, ax =plt.subplots(2,1,sharex=True)

p1 = sns.boxplot(data=non_zero_df, x='Unique outer links', palette='viridis', ax=ax[0])
p1.set(xscale='log')
p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=non_zero_df, x='Unique outer links', palette='viridis', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)

print(non_zero_df['Unique outer links'].describe())

#print(str(non_zero_df['Unique outer links'].describe()).replace(' ',''))

top_df = non_zero_df[['URL','Unique outer links']].sort_values('Unique outer links', ascending=False).reset_index(drop=True)

top_df.index += 1

print(top_df.head(10))

#print(str(top_df.head(10)).replace(' ','').replace('http://','').replace('https://',''))

non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique base outer links'] != 0]

ticks = [1,10,100,500,1000,2500,5000,10000,20000]

# Plotting

sns.set_style('darkgrid')
sns.set(font_scale = 1.3)

#g.set(xticks=ticks)


fig, ax =plt.subplots(2,1,sharex=True)

p1 = sns.boxplot(data=non_zero_df, x='Unique base outer links', palette='viridis', ax=ax[0])
p1.set(xscale='log')
p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=non_zero_df, x='Unique base outer links', palette='viridis', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)

print(non_zero_df['Unique base outer links'].describe())

top_df = non_zero_df[['URL','Unique base outer links']].sort_values('Unique base outer links', ascending=False).reset_index(drop=True)
top_df.index += 1
print(top_df.head(10))

pd.set_option('display.float_format', lambda x: '%.9f' % x)

non_zero_df = link_counts_cats_df[link_counts_cats_df['Total outer links'] != 0]

ticks = [1,10,100,500,1000,2500,5000,10000,25000,50000,100000,500000,1500000]

# Plotting

sns.set_style('darkgrid')
sns.set(font_scale = 1.3)

#g.set(xticks=ticks)

fig, ax =plt.subplots(2,1,sharex=True)

p1 = sns.boxplot(data=non_zero_df, x='Total outer links', palette='viridis', ax=ax[0])
p1.set(xscale='log')
p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=non_zero_df, x='Total outer links', palette='viridis', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)

print(non_zero_df['Total outer links'].describe())

top_df = non_zero_df[['URL','Total outer links']].sort_values('Total outer links', ascending=False).reset_index(drop=True)
top_df.index += 1
print(top_df.head(10))

all_base_link_counts = []
for site in link_counts_cats_df['Base outer links']:
    for val in site[1].values():
        all_base_link_counts.append(val)

ticks = [1,2,3,5,10,20,50,100,1000,10000,100000]

links_counts_df = pd.DataFrame(data=all_base_link_counts, columns=['count'])
sns.set_style('darkgrid')

sns.set(font_scale = 1.3)

fig, ax =plt.subplots(2,1,sharex=True)

p1 = sns.boxplot(data=links_counts_df, x='count', palette='rocket', ax=ax[0])
p1.set(xscale='log')


p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=links_counts_df, x='count', palette='rocket', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
p2.set(xlabel='Received links per base outer link, including duplicates (n={})'.format(len(links_counts_df['count'])))
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)

print(links_counts_df['count'].describe())

all_base_link_counts = []

for site in link_counts_cats_df.loc[link_counts_cats_df['URL'] == 'http://www.christianarchy.nl']['Base outer links']:
    for val in site[1].values():
        all_base_link_counts.append(val)

ticks = [1,2,3,5,10,20,50,100,1000,10000,100000]

links_counts_df = pd.DataFrame(data=all_base_link_counts, columns=['count'])
sns.set_style('darkgrid')

sns.set(font_scale = 1.3)

fig, ax =plt.subplots(2,1,sharex=True)

p1 = sns.boxplot(data=links_counts_df, x='count', palette='rocket', ax=ax[0])
p1.set(xscale='log')
p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=links_counts_df, x='count', palette='rocket', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
p2.set(xlabel='christianarchy.nl: received links per base outer link (n={})'.format(len(links_counts_df['count'])))
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)

print(links_counts_df['count'].describe(percentiles=[.25, .5, .75, .90, .99]))

non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique inner links'] != 0]

non_zero_df = non_zero_df[non_zero_df['Unique outer links'] != 0]

plot_corr_sns = sns.regplot(x="Unique inner links", y="Unique outer links", color="r", data=non_zero_df)
plt.gcf().set_size_inches(15, 15)

select = ['Christianity']

non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique base outer links'] != 0]

non_zero_df = non_zero_df.loc[non_zero_df['Categories primary'].isin(select), :]

ticks = [1,5,10,25,50,100,250,500,1000,5000,17000]

plt.figure(figsize=(15,8)) # Set plot dimensions

p = sns.stripplot(x='Categories secondary', y='Unique base outer links', palette='viridis', data=non_zero_df, color='grey')
p = sns.boxplot(x='Categories secondary', y='Unique base outer links', data=non_zero_df, color='white')
p.set(yscale='log')
p.set(xlabel=None)
p.set(yticks=ticks, yticklabels=ticks)

select = ['Judaism','Islam','Hinduism','Buddhism','Secondary traditional religions']

non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique base outer links'] != 0]


non_zero_df = non_zero_df.loc[non_zero_df['Categories secondary'].isin(select), :]

ticks = [1,5,10,25,50,100,250,500,1000]

plt.figure(figsize=(15,8)) # Set plot dimensions

p = sns.stripplot(x='Categories secondary', y='Unique base outer links', palette='viridis', data=non_zero_df, color='grey')
p = sns.boxplot(x='Categories secondary', y='Unique base outer links', data=non_zero_df, color='white')
p.set(yscale='log')
p.set(xlabel=None)
p.set(yticks=ticks, yticklabels=ticks)

select = ['New movements','Spirituality','Contemporary paganisms','Non-theism']

non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique base outer links'] != 0]

non_zero_df = non_zero_df.loc[non_zero_df['Categories secondary'].isin(select), :]

ticks = [1,5,10,25,50,100,250,500,1000,1500]

plt.figure(figsize=(15,8)) # Set plot dimensions

p = sns.stripplot(x='Categories secondary', y='Unique base outer links', palette='viridis', data=non_zero_df, color='grey')
p = sns.boxplot(x='Categories secondary', y='Unique base outer links', data=non_zero_df, color='white')
p.set(yscale='log')
p.set(xlabel=None)
p.set(yticks=ticks, yticklabels=ticks)

unique_links = []

nodes_tuples = []

edges_tuples = []

for index, row in link_counts_cats_df.iterrows():
    nodes_tuples.append((index, row['URL'][row['URL'].find('//')+2:], row['Categories primary'],
                         row['Categories secondary'], row['Categories tetriary']))
    unique_links.append(row['URL'][row['URL'].find('//')+2:])

for log in logs:
    current_id = nodes_tuples[-1][0]+1
    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    if seed_link in list(link_counts_cats_df['URL']):
        raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
        for link in raw_base_outer.keys():
            if link not in unique_links:
                unique_links.append(link)
                nodes_tuples.append((current_id, link, 'outer', 'outer', 'outer'))
                current_id += 1

nodes_df = pd.DataFrame(nodes_tuples, columns=['Id', 'URL', 'Category primary', 'Category secondary', 'Category tetriary'])

belongs_to_dataset_list = []

for row in nodes_df['Category primary']:
    if row != 'outer':
        belongs_to_dataset_list.append(1)
    else:
        belongs_to_dataset_list.append(0)

nodes_df.insert(5, 'Part of dataset', belongs_to_dataset_list, True)

for log in logs:
    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    if seed_link in list(link_counts_cats_df['URL']):
        print(seed_link)
        seed_link = seed_link[seed_link.find('//')+2:]
        seed_id = nodes_df[nodes_df['URL'] == seed_link]['Id'].values[0]
        raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
        for link, weight in raw_base_outer.items():
            current_id = nodes_df[nodes_df['URL'] == link]['Id'].values[0]
            edges_tuples.append((seed_id, current_id, 'Directed', weight))

edges_df = pd.DataFrame(edges_tuples, columns =['Source', 'Target', 'Type','Weight'])
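The nodes_df and edges_df tables above follow a plain node-list/edge-list convention (Id and URL for nodes; Source, Target, Type, Weight for edges). As a quick, optional sanity check, not part of the original pipeline and assuming networkx is available, the edge list can be loaded into a directed graph:

import networkx as nx  # assumption: networkx is installed; this check is not part of the original pipeline

G = nx.from_pandas_edgelist(edges_df, source='Source', target='Target',
                            edge_attr='Weight', create_using=nx.DiGraph())
print(G.number_of_nodes(), G.number_of_edges())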

# Separate script for filtering out all ties between the corpus sites

unique_links = []

nodes_tuples = []

edges_tuples = []

no_http_corpus_link = [link[link.find('//')+2:] for link in link_counts_cats_df['URL']]

for index, row in link_counts_cats_df.iterrows():
    nodes_tuples.append((index, row['URL'][row['URL'].find('//')+2:], row['Categories primary'],
                         row['Categories secondary'], row['Categories tetriary']))
    unique_links.append(row['URL'][row['URL'].find('//')+2:])

for log in logs:
    current_id = nodes_tuples[-1][0]+1
    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    if seed_link in list(link_counts_cats_df['URL']):
        raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
        for link in raw_base_outer.keys():
            if link not in unique_links:
                unique_links.append(link)
                nodes_tuples.append((current_id, link, 'outer', 'outer', 'outer'))
                current_id += 1

nodes_df_no_corpus = pd.DataFrame(nodes_tuples, columns=['Id', 'URL', 'Category primary', 'Category secondary', 'Category tetriary'])

belongs_to_dataset_list = []

for row in nodes_df_no_corpus['Category primary']:
    if row != 'outer':
        belongs_to_dataset_list.append(1)
    else:
        belongs_to_dataset_list.append(0)

nodes_df_no_corpus.insert(5, 'Part of dataset', belongs_to_dataset_list, True)

for log in logs:
    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    if seed_link in list(link_counts_cats_df['URL']):
        print(seed_link)
        seed_link = seed_link[seed_link.find('//')+2:]
        seed_id = nodes_df_no_corpus[nodes_df_no_corpus['URL'] == seed_link]['Id'].values[0]
        raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
        for link, weight in raw_base_outer.items():
            if link not in no_http_corpus_link:
                current_id = nodes_df_no_corpus[nodes_df_no_corpus['URL'] == link]['Id'].values[0]
                edges_tuples.append((seed_id, current_id, 'Directed', weight))

edges_df_no_corpus = pd.DataFrame(edges_tuples, columns=['Source', 'Target', 'Type', 'Weight'])

nodes_df_no_corpus.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\nodes_df_nocorpus.csv', index=False)
edges_df_no_corpus.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\edges_df_nocorpus.csv', index=False)
nodes_df.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\nodes_df.csv', index=False)
edges_df.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\edges_df.csv', index=False)