10 Appendix
10.1 Python code
10.1.3 Link analysis
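The code below parses the per-site link logs produced by the harvester, cleans and counts inner and outer links, merges the counts with the corpus categories, plots the resulting distributions, and exports node and edge tables of the outlink network.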
    # (tail of scrape_links_and_SM, continued from the previous listing)
    links_per_level = {k: v for k, v in links_per_level.items() if v}
    return links_per_level, outer_links_base, log_name

for seed in logs:
    link_dict, base_outer_links, log_name = scrape_links_and_SM(seed, 4)
    if link_dict and base_outer_links and log_name:
        outerlinks_count = 0
        outerlinks_total_count = 0
        for k, v in base_outer_links.items():
            outerlinks_count += 1
            outerlinks_total_count += v
        # inner count starts at -1 so the seed URL itself (level 0) is not counted
        link_counts = [-1, 0]
        for key, value in link_dict.items():
            if 'inner' in value.keys():
                link_counts[0] += len(value['inner'])
            if 'outer' in value.keys():
                link_counts[1] += len(value['outer'])
        links_count_dict = {'Unique inner links:': link_counts[0],
                            'Unique outer links': link_counts[1],
                            'Unique base outer links:': outerlinks_count,
                            'Total outer links': outerlinks_total_count}
        # Re-join string pieces that pprint wrapped across lines
        link_dict_print_op = pp.pformat(link_dict)
        print_split = [s.replace(' \'\n \'', ' ') for s in link_dict_print_op.split(',')]
        reconstr_print = ','.join(print_split)
        with open(log_name, "w") as log_file:
            pprint.pprint(links_count_dict, log_file, sort_dicts=False, width=1)
            log_file.write('\n')
            pprint.pprint(base_outer_links, log_file, width=1)
            log_file.write('\n')
            log_file.write(reconstr_print)
            log_file.write('\n\n')
            log_file.write('Raw links counts: ' + str(links_count_dict))
            log_file.write('\n\n')
            log_file.write('Raw base outer links: ' + str(base_outer_links))
            log_file.write('\n\n')
            log_file.write('Raw link levels: ' + str(link_dict))
    else:
        print(seed, ' gives error.')
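For reference when reading the parsing code further on: pprint itself ends each block with a newline and the extra write('\n') completes a blank-line separator, so splitting a log on '\n\n' yields the following layout (assuming reconstr_print contains no blank lines):

# log.split('\n\n') layout:
#   [0] pretty-printed links_count_dict
#   [1] pretty-printed base_outer_links
#   [2] reconstructed pretty-print of link_dict
#   [3] 'Raw links counts: {...}'
#   [4] 'Raw base outer links: {...}'
#   [5] 'Raw link levels: {...}'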
import glob
import ast
import json
import urllib.parse
import urllib3
import pprint as pp
from collections import Counter

import pandas as pd
import tldextract
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
reli_pd = pd.read_excel(r'Websites_corpus.xlsx')[:655]
reli_pd['Category secondary + tetriary'] = [t.replace('.','') for t in reli_pd['Category secondary + tetriary']]
cats=list(reli_pd['Category secondary + tetriary'])
for t in cats.copy():
if ';' in t:
        split = t.split(';')
        cats.remove(t)
        for s in split:
            cats.append(s)
categories_secondary = [cat.partition(';')[0] for cat in reli_pd['Category secondary + tetriary']]
categories_tetriary = [cat.partition(';')[2] for cat in reli_pd['Category secondary + tetriary']]
for idx, x in enumerate(categories_tetriary):
if x == '':
categories_tetriary[idx] = categories_secondary[idx]
christianities = ['Protestantism','Oriental Orthodox Churches','Roman Catholicism', 'Eastern-Orthodox churches']
others = ['Native traditions','Non-theism','Mystic traditions','General','Secondary traditional religions']
categories_primary = [cat if cat not in christianities else 'Christianity' for cat in categories_secondary]
categories_primary = [cat if cat not in others else 'Other' for cat in categories_primary]
reli_pd.insert(19, "Categories primary", categories_primary, True)
reli_pd.insert(20, "Categories secondary", categories_secondary, True)
reli_pd.insert(21, "Categories tetriary", categories_tetriary, True)
reli_pd_cats = reli_pd[['URL','Categories primary','Categories secondary','Categories tetriary', 'Language(s)']]
reli_pd_cats['URL'] = [url.strip('/') for url in reli_pd_cats['URL']]
primary_language = [lang.split(',')[0] for lang in reli_pd_cats['Language(s)']]
reli_pd_cats.insert(1, "Primary language", primary_language, True)
reli_pd_cats.drop('Language(s)', axis=1, inplace=True)
reli_pd_cats
logs = glob.glob(r'Link_logs\*.log')
filters_all = ['mailto:','MAILTO:','javascript:','@','share=','facebook.com/sharer','linkedin.com/shareArticle',
               'whatsapp.com/send?','wa.me/?','twitter.com/intent/tweet','www.twitter.com/share','www.pinterest.com/pin',
               'reddit.com/submit','telegram.me/share']
filters_inner = ['/comment','/post-edit','/email-post','/search/','/search?','file://','replytocom',
                 'add-to-cart=','ddownload=','tmpl=component','print=1','/winkelwagen/','/shoppingcart/','/shopping-cart/']
filters_files = ['.jpg','.jpeg','.gif','.png','.tiff','.tif','.eps',
                 '.exe','.xlsx','.xls','.txt','.docx','.msi','.raw','.bmp',
                 '.mid','.mp3','.mp4','.avi','.pdf','.zip','.doc','.7z','.ico','.csv','.odt','.ods','.ppt','.pptx',
                 '.epub','.flac','.ogg','.wav','.mpeg','.mov','.wmv','.flv','.webm','.mkv']
filters_files_upper = [file.upper() for file in filters_files]
filters_inner = filters_inner + filters_files + filters_files_upper
URL_list_scraped = []
raw_links_count_list = []
raw_base_outer_list = []
first_level_list = []
seed_list = []
for log_name in logs:
    with open(log_name, encoding='latin-1') as infile:
        log = infile.read()
    current_level = []
    raw_links_count = ast.literal_eval(log.split('\n\n')[3].strip('Raw links counts: '))
    raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    # This part cleans the data: filtering of files within inlinks and of social media
    # 'shares' within outlinks hadn't happened adequately, or new filters were added after
    # the harvest. The rationale for also subtracting equally from 'Total outer links' is
    # that each social media share link is customized for the page it is on, meaning it is
    # used only once.
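    # For example, a share link such as 'https://twitter.com/intent/tweet?url=<page URL>'
    # (hypothetical URL) embeds the address of the page it appears on, so it occurs exactly
    # once in the crawl; removing it therefore lowers the unique and the total outer-link
    # counts by the same amount.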
    for i in range(1, 5):
        if i in raw_link_level.keys():
            if 'inner' in raw_link_level[i].keys():
                before_inner_len = len(raw_link_level[i]['inner'])
                raw_link_level[i]['inner'] = [val for val in raw_link_level[i]['inner']
                                              if not any(x in val for x in filters_inner)]
                after_inner_len = len(raw_link_level[i]['inner'])
                inner_difference = before_inner_len - after_inner_len
                #print(inner_difference)
                raw_links_count['Unique inner links:'] = raw_links_count['Unique inner links:'] - inner_difference
            if 'outer' in raw_link_level[i].keys():
                before_outer_len = len(raw_link_level[i]['outer'])
                current_outer = raw_link_level[i]['outer']
                raw_link_level[i]['outer'] = [val for val in raw_link_level[i]['outer']
                                              if not any(x in val for x in filters_all)]
                after_outer_len = len(raw_link_level[i]['outer'])
                outer_difference = before_outer_len - after_outer_len
                raw_links_count['Unique outer links'] = raw_links_count['Unique outer links'] - outer_difference
                raw_links_count['Total outer links'] = raw_links_count['Total outer links'] - outer_difference
                #print(raw_links_count['Unique outer links'], raw_links_count['Total outer links'])
                for link in current_outer:
                    if link not in raw_link_level[i]['outer']:
                        link_base = urllib.parse.urlparse(link).hostname
                        if link_base in raw_base_outer.keys():
                            raw_base_outer[link_base] -= 1
    for base, count in zip(list(raw_base_outer.keys()), list(raw_base_outer.values())):
        if count <= 0:
            del raw_base_outer[base]
    #print(raw_base_outer)
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    seed_list.append(seed_link)
    URL_list_scraped.append(seed_link.strip('/'))
    raw_links_count_list.append((seed_link, raw_links_count))
    raw_base_outer_list.append((seed_link, raw_base_outer))
    if 1 in raw_link_level.keys():
        if 'inner' in raw_link_level[1].keys():
            first_level = raw_link_level[1]['inner']
            first_level.append(seed_link)
            first_level_list.append(first_level)
        else:
            first_level_list.append([seed_link])
    else:
        first_level_list.append([seed_link])

data = {'URL': seed_list,
        'Base outer links': raw_base_outer_list,
        'First level inner links': first_level_list}
level1_db = pd.DataFrame(data=data)
level1_db

#Drop links that couldn't be scraped
for link in reli_pd_cats['URL']:
    if link not in URL_list_scraped:
        reli_pd_cats = reli_pd_cats.drop(reli_pd_cats['URL'][reli_pd_cats['URL'] == link].index[0])
reli_pd_cats = reli_pd_cats.reset_index(drop=True)
reli_pd_cats
Unique_inner_links = []
Unique_outer_links = []
Unique_base_outer_links = []
Total_outer_links = []
for link in raw_links_count_list:
    Unique_inner_links.append(link[1]['Unique inner links:'])
    Unique_outer_links.append(link[1]['Unique outer links'])
    Unique_base_outer_links.append(link[1]['Unique base outer links:'])
    Total_outer_links.append(link[1]['Total outer links'])
data = {'URL': URL_list_scraped,
        'Unique inner links': Unique_inner_links,
        'Unique outer links': Unique_outer_links,
        'Unique base outer links': Unique_base_outer_links,
        'Total outer links': Total_outer_links}
link_counts_df = pd.DataFrame(data=data)
link_counts_cats_df = link_counts_df.merge(reli_pd_cats, how='inner', on='URL')
link_counts_cats_df = link_counts_cats_df.merge(level1_db, how='inner', on='URL')
link_counts_cats_df
#Drop rows whose inner links are 0 and outer links are less than 5
link_counts_cats_df.drop(link_counts_cats_df.index[(link_counts_cats_df['Unique inner links'] == 0) & (link_counts_cats_df['Unique outer links'] < 5)], inplace=True)
link_counts_cats_df = link_counts_cats_df.reset_index(drop=True)
link_counts_cats_df
##Add the domains as a separate column
domains = []
for url in link_counts_cats_df['URL']:
    domains.append(tldextract.extract(url)[2])
base_outer_domains = []
for row in link_counts_cats_df['Base outer links']:
    base_outer_domains.append([tldextract.extract(url)[2] for url in row[1].keys()])
link_counts_cats_df.insert(1, "URL domain", domains, True)
link_counts_cats_df.insert(10, "Base outer domains", base_outer_domains, True)
link_counts_cats_df
dict(Counter(link_counts_cats_df['URL domain']))
domains_series = link_counts_cats_df.loc[link_counts_cats_df['URL domain'] == 'nl']['Base outer domains']
total_domains = [item for sublist in domains_series for item in sublist]
#print(sum(dict(Counter(total_domains)).values()))
#print(len(dict(Counter(total_domains)).keys()))
#print(dict(Counter(total_domains)))
domains_series = link_counts_cats_df.loc[link_counts_cats_df['URL domain'] == 'com']['Base outer domains']
total_domains = [item for sublist in domains_series for item in sublist]
#print(sum(dict(Counter(total_domains)).values()))
#print(len(dict(Counter(total_domains)).keys()))
#print(dict(Counter(total_domains)))
domains_series = link_counts_cats_df.loc[link_counts_cats_df['URL domain'] == 'org']['Base outer domains']
total_domains = [item for sublist in domains_series for item in sublist]
#print(sum(dict(Counter(total_domains)).values()))
#print(len(dict(Counter(total_domains)).keys()))
#print(dict(Counter(total_domains)))
domains_series = link_counts_cats_df.loc[~link_counts_cats_df['URL domain'].isin(['nl','com','org'])]['Base outer domains']
#print(Counter(list(link_counts_cats_df.loc[~link_counts_cats_df['URL domain'].isin(['nl','com','org'])]['URL domain'])))
total_domains = [item for sublist in domains_series for item in sublist]
#print(sum(dict(Counter(total_domains)).values()))
#print(len(dict(Counter(total_domains)).keys()))
#print(dict(Counter(total_domains)))
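The four near-identical blocks above tally the same three quantities per top-level-domain group. A compact equivalent is sketched below; domain_tally is a helper name introduced here, not part of the original notebook.

def domain_tally(df, mask):
    """Flatten 'Base outer domains' for the selected rows and count occurrences."""
    series = df.loc[mask, 'Base outer domains']
    totals = Counter(item for sublist in series for item in sublist)
    return sum(totals.values()), len(totals), totals

for tld in ['nl', 'com', 'org']:
    total, unique, counts = domain_tally(link_counts_cats_df,
                                         link_counts_cats_df['URL domain'] == tld)
    print(tld, total, unique)

# all remaining domains in one go
total, unique, counts = domain_tally(link_counts_cats_df,
                                     ~link_counts_cats_df['URL domain'].isin(['nl', 'com', 'org']))
print('other', total, unique)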
non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique inner links'] != 0]
ticks = [1,10,100,500,1000,2500,5000,10000,50000,100000,200000]
# Plotting
sns.set_style('darkgrid')
sns.set(font_scale = 1.3)
#g.set(xticks=ticks)
fig, ax =plt.subplots(2,1,sharex=True)
p1 = sns.boxplot(data=non_zero_df, x='Unique inner links', palette='viridis', ax=ax[0])
p1.set(xscale='log')
p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=non_zero_df, x='Unique inner links', palette='viridis', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)
print(non_zero_df['Unique inner links'].describe())
top_df = non_zero_df[['URL','Unique inner links']].sort_values('Unique inner links', ascending=False).reset_index(drop=True)
top_df.index += 1
print(top_df.head(10))
non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique outer links'] != 0]
ticks = [1,10,100,500,1000,2500,5000,10000,50000]
# Plotting
sns.set_style('darkgrid')
sns.set(font_scale = 1.3)
#g.set(xticks=ticks)
fig, ax = plt.subplots(2, 1, sharex=True)
p1 = sns.boxplot(data=non_zero_df, x='Unique outer links', palette='viridis', ax=ax[0])
p1.set(xscale='log')
p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=non_zero_df, x='Unique outer links', palette='viridis', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)
print(non_zero_df['Unique outer links'].describe())
#print(str(non_zero_df['Unique outer links'].describe()).replace(' ',''))
top_df = non_zero_df[['URL','Unique outer links']].sort_values('Unique outer links', ascending=False).reset_index(drop=True)
top_df.index += 1
print(top_df.head(10))
#print(str(top_df.head(10)).replace(' ','').replace('http://','').replace('https://',''))

non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique base outer links'] != 0]
ticks = [1,10,100,500,1000,2500,5000,10000,20000]
# Plotting
sns.set_style('darkgrid')
sns.set(font_scale = 1.3)
#g.set(xticks=ticks)
fig, ax = plt.subplots(2, 1, sharex=True)
p1 = sns.boxplot(data=non_zero_df, x='Unique base outer links', palette='viridis', ax=ax[0])
p1.set(xscale='log')
p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=non_zero_df, x='Unique base outer links', palette='viridis', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)
print(non_zero_df['Unique base outer links'].describe())
top_df = non_zero_df[['URL','Unique base outer links']].sort_values('Unique base outer links', ascending=False).reset_index(drop=True)
top_df.index += 1
print(top_df.head(10))
pd.set_option('display.float_format', lambda x: '%.9f' % x)
non_zero_df = link_counts_cats_df[link_counts_cats_df['Total outer links'] != 0]
ticks = [1,10,100,500,1000,2500,5000,10000,25000,50000,100000,500000,1500000]
# Plotting
sns.set_style('darkgrid')
sns.set(font_scale = 1.3)
#g.set(xticks=ticks)
fig, ax = plt.subplots(2, 1, sharex=True)
p1 = sns.boxplot(data=non_zero_df, x='Total outer links', palette='viridis', ax=ax[0])
p1.set(xscale='log')
p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=non_zero_df, x='Total outer links', palette='viridis', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)
print(non_zero_df['Total outer links'].describe())
top_df = non_zero_df[['URL','Total outer links']].sort_values('Total outer links', ascending=False).reset_index(drop=True)
top_df.index += 1
print(top_df.head(10))

all_base_link_counts = []
for site in link_counts_cats_df['Base outer links']:
    for val in site[1].values():
        all_base_link_counts.append(val)
ticks = [1,2,3,5,10,20,50,100,1000,10000,100000]
links_counts_df = pd.DataFrame(data=all_base_link_counts, columns=['count'])
sns.set_style('darkgrid')
sns.set(font_scale = 1.3)
fig, ax = plt.subplots(2, 1, sharex=True)
p1 = sns.boxplot(data=links_counts_df, x='count', palette='rocket', ax=ax[0])
p1.set(xscale='log')
p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=links_counts_df, x='count', palette='rocket', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
p2.set(xlabel='Received links per base outer link, including duplicates (n={})'.format(len(links_counts_df['count'])))
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)
print(links_counts_df['count'].describe())

all_base_link_counts = []
for site in link_counts_cats_df.loc[link_counts_cats_df['URL'] == 'http://www.christianarchy.nl']['Base outer links']:
    for val in site[1].values():
        all_base_link_counts.append(val)
ticks = [1,2,3,5,10,20,50,100,1000,10000,100000]
links_counts_df = pd.DataFrame(data=all_base_link_counts, columns=['count'])
sns.set_style('darkgrid')
sns.set(font_scale = 1.3)
fig, ax = plt.subplots(2, 1, sharex=True)
p1 = sns.boxplot(data=links_counts_df, x='count', palette='rocket', ax=ax[0])
p1.set(xscale='log')
p1.set(xticks=ticks, xticklabels=ticks)
p1.set(xlabel=None)
p2 = sns.stripplot(data=links_counts_df, x='count', palette='rocket', ax=ax[1], jitter=0.3)
p2.set(xscale='log')
p2.set(xticks=ticks, xticklabels=ticks)
p2.set(xlabel='christianarchy.nl: received links per base outer link (n={})'.format(len(links_counts_df['count'])))
plt.subplots_adjust(wspace=0, hspace=0)
plt.gcf().set_size_inches(20, 4)
print(links_counts_df['count'].describe(percentiles=[.25, .5, .75, .90, .99]))

non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique inner links'] != 0]
non_zero_df = non_zero_df[non_zero_df['Unique outer links'] != 0]
plot_corr_sns = sns.regplot(x="Unique inner links", y="Unique outer links", color="r", data=non_zero_df)
plt.gcf().set_size_inches(15, 15)

select = ['Christianity']
non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique base outer links'] != 0]
non_zero_df = non_zero_df.loc[non_zero_df['Categories primary'].isin(select), :]
ticks = [1,5,10,25,50,100,250,500,1000,5000,17000]
plt.figure(figsize=(15,8)) # Set plot dimensions
p = sns.stripplot(x='Categories secondary', y='Unique base outer links', palette='viridis', data=non_zero_df, color='grey')
p = sns.boxplot(x='Categories secondary', y='Unique base outer links', data=non_zero_df, color='white')
p.set(yscale='log')
p.set(xlabel=None)
p.set(yticks=ticks, yticklabels=ticks)
select = ['Judaism','Islam','Hinduism','Buddhism','Secondary traditional religions']
non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique base outer links'] != 0]
non_zero_df = non_zero_df.loc[non_zero_df['Categories secondary'].isin(select), :]
ticks = [1,5,10,25,50,100,250,500,1000]
plt.figure(figsize=(15,8)) # Set plot dimensions
p = sns.stripplot(x='Categories secondary', y='Unique base outer links', palette='viridis', data=non_zero_df, color='grey')
p = sns.boxplot(x='Categories secondary', y='Unique base outer links', data=non_zero_df, color='white')
p.set(yscale='log')
p.set(xlabel=None)
p.set(yticks=ticks,yticklabels=ticks)
select = ['New movements','Spirituality','Contemporary paganisms','Non-theism']
non_zero_df = link_counts_cats_df[link_counts_cats_df['Unique base outer links'] != 0]
non_zero_df = non_zero_df.loc[non_zero_df['Categories secondary'].isin(select), :]
ticks = [1,5,10,25,50,100,250,500,1000,1500]
plt.figure(figsize=(15,8)) # Set plot dimensions
p = sns.stripplot(x='Categories secondary', y='Unique base outer links', palette='viridis', data=non_zero_df, color='grey')
p = sns.boxplot(x='Categories secondary', y='Unique base outer links', data=non_zero_df, color='white')
p.set(yscale='log')
p.set(xlabel=None)
p.set(yticks=ticks, yticklabels=ticks)

unique_links = []
nodes_tuples = []
edges_tuples = []
for index, row in link_counts_cats_df.iterrows():
    nodes_tuples.append((index, row['URL'][row['URL'].find('//')+2:], row['Categories primary'],
                         row['Categories secondary'], row['Categories tetriary']))
    unique_links.append(row['URL'][row['URL'].find('//')+2:])
for log in logs:
    current_id = nodes_tuples[-1][0] + 1
    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    if seed_link in list(link_counts_cats_df['URL']):
        raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
        for link in raw_base_outer.keys():
            if link not in unique_links:
                unique_links.append(link)
                nodes_tuples.append((current_id, link, 'outer', 'outer', 'outer'))
                current_id += 1
nodes_df = pd.DataFrame(nodes_tuples, columns=['Id', 'URL', 'Category primary', 'Category secondary', 'Category tetriary'])
belongs_to_dataset_list = []
for row in nodes_df['Category primary']:
    if row != 'outer':
        belongs_to_dataset_list.append(1)
    else:
        belongs_to_dataset_list.append(0)
nodes_df.insert(5, 'Part of dataset', belongs_to_dataset_list, True)
for log in logs:
    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    if seed_link in list(link_counts_cats_df['URL']):
        print(seed_link)
        seed_link = seed_link[seed_link.find('//')+2:]
        seed_id = nodes_df[nodes_df['URL'] == seed_link]['Id'].values[0]
        raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
        for link, weight in raw_base_outer.items():
            current_id = nodes_df[nodes_df['URL'] == link]['Id'].values[0]
            edges_tuples.append((seed_id, current_id, 'Directed', weight))
edges_df = pd.DataFrame(edges_tuples, columns=['Source', 'Target', 'Type', 'Weight'])
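As a quick sanity check on the resulting tables (not part of the original pipeline), the node and edge frames load directly into a directed graph; a minimal sketch, assuming networkx is installed:

import networkx as nx

# Directed, weighted graph; node ids are the 'Id' column of nodes_df.
G = nx.from_pandas_edgelist(edges_df, source='Source', target='Target',
                            edge_attr='Weight', create_using=nx.DiGraph)
# Attach URL and category attributes to each node.
nx.set_node_attributes(G, nodes_df.set_index('Id').to_dict('index'))
print(G.number_of_nodes(), G.number_of_edges())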
#Separate script for filtering out all ties between the corpus sites
unique_links = []
nodes_tuples = []
edges_tuples = []
no_http_corpus_link = [link[link.find('//')+2:] for link in link_counts_cats_df['URL']]
for index, row in link_counts_cats_df.iterrows():
    nodes_tuples.append((index, row['URL'][row['URL'].find('//')+2:], row['Categories primary'],
                         row['Categories secondary'], row['Categories tetriary']))
    unique_links.append(row['URL'][row['URL'].find('//')+2:])
for log in logs:
    current_id = nodes_tuples[-1][0] + 1
    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    if seed_link in list(link_counts_cats_df['URL']):
        raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
        for link in raw_base_outer.keys():
            if link not in unique_links:
                unique_links.append(link)
                nodes_tuples.append((current_id, link, 'outer', 'outer', 'outer'))
                current_id += 1
nodes_df_no_corpus = pd.DataFrame(nodes_tuples, columns=['Id', 'URL', 'Category primary', 'Category secondary', 'Category tetriary'])
belongs_to_dataset_list = []
for row in nodes_df_no_corpus['Category primary']:
    if row != 'outer':
        belongs_to_dataset_list.append(1)
    else:
        belongs_to_dataset_list.append(0)
nodes_df_no_corpus.insert(5, 'Part of dataset', belongs_to_dataset_list, True)
for log in logs:
    with open(log, encoding='latin-1') as infile:
        log = infile.read()
    raw_link_level = ast.literal_eval(log.split('\n\n')[5].strip('Raw link levels: '))
    seed_link = raw_link_level[0]['inner'][0].strip('/')
    if seed_link in list(link_counts_cats_df['URL']):
        print(seed_link)
        seed_link = seed_link[seed_link.find('//')+2:]
        seed_id = nodes_df_no_corpus[nodes_df_no_corpus['URL'] == seed_link]['Id'].values[0]
        raw_base_outer = ast.literal_eval(log.split('\n\n')[4].strip('Raw base outer links: '))
        for link, weight in raw_base_outer.items():
            if link not in no_http_corpus_link:
                current_id = nodes_df_no_corpus[nodes_df_no_corpus['URL'] == link]['Id'].values[0]
                edges_tuples.append((seed_id, current_id, 'Directed', weight))
edges_df_no_corpus = pd.DataFrame(edges_tuples, columns=['Source', 'Target', 'Type', 'Weight'])
nodes_df_no_corpus.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\nodes_df_nocorpus.csv', index=False)
edges_df_no_corpus.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\edges_df_nocorpus.csv', index=False)
nodes_df.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\nodes_df.csv', index=False)
edges_df.to_csv(r'C:\Users\gebruiker\Documents\Thesis_2022\edges_df.csv', index=False)