from IPython.display import HTML
HTML('''<button type="button" class="btn btn-outline-danger"  onclick="codeToggle();">Toggle Code</button>''')


       
        import warnings
warnings.filterwarnings('ignore')


       
        import os
import random
import re
import string
import tabulate
from collections import Counter

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from matplotlib import rc
rc('animation', html='jshtml')


       
        import seaborn as sns

import re, os, string
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download("stopwords")

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA

import networkx as nx


       
        import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
SEED = 42
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)


       
        import plotly
plotly.offline.init_notebook_mode()


       
        papers_df = pd.read_csv('data/new_papers.csv')
papers_df.head()


       
        # Visualising the data

fig = px.scatter_3d(papers_df.sample(n=100), x='Year', y='Conference', z='Author',
              color='Affiliation', template="plotly_dark")
# fig.update_layout(margin={"r":0,"t":10,"l":00,"b":100})
fig.show()


       
        # get the list of the top 10 universities till now
dict_ = {}
for insti in papers_df['Affiliation']:
    insti = str(insti)
    if insti == 'None':
        continue
    if insti not in dict_.keys():
        dict_[insti] = (papers_df.Affiliation == insti).sum()

sorted_by_value=(sorted(dict_.items(), key=lambda item: item[1], reverse=True))
top_10_institutes = []
for tuple_ in sorted_by_value[:10]:
    top_10_institutes.append(tuple_[0])


       
        # get a dictionary which outputs the list of number of papers published by top 10 universities in a year x:
y_paper_count_top_10 = {}

for year in range(2006,2022, 1):
    y_paper_count_top_10[year] = []
    temp_df = papers_df[papers_df.Year == year]
    for insti in top_10_institutes:
        count = (temp_df.Affiliation==insti).sum()
        y_paper_count_top_10[year].append(count)


       
        # Now we need to plot the growth on a bar chart for the institutes
# we will have 8 points : 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020, 2021

import plotly.graph_objects as go

# Create figure
fig = go.Figure()

# Add traces, one for each slider step
for step in np.arange(2006, 2021, 1):
    fig.add_trace(
        go.Bar(
            visible=False,
            # line=dict(color="#00CED1", width=6),
            # name="𝜈 = " + str(step),
            x = top_10_institutes,
            y = y_paper_count_top_10[step],
            name = "Year="+str(step),
            
            # color = 'rgb(255,0,0)',
           ))

# Make 10th trace visible
fig.data[0].visible = True

# Create and add slider
steps = []
for i in range(len(fig.data)):
    step = dict(
        method="update",
        args=[{"visible": [False] * len(fig.data)},
              {"title": "Slider switched to Year: " + str(2006+i)}],  # layout attribute
        label = str(2006+i*1),
    )
    step["args"][0]["visible"][i] = True  # Toggle i'th trace to "visible"
    steps.append(step)

sliders = [dict(
    active=0,
    currentvalue={"prefix": "Trend For Year: ", },
    pad={"t": 80},
    steps=steps
)]

fig.update_layout(
    sliders=sliders
)

fig.show()


       
        papers_df


       
        # Trend for author count every year
# First we get author list for one year, then we see if it was in the past year or not.

# Assumption is that once we seen an author has published in some year, then that author remains active subsequent years.
# This trend is increasing (either constant or increases)

# List of year in dataset:
year_list = sorted(list(set(papers_df["Year"])))

dict_year_author = {}
dict_year_author_cumu = {}

for year in year_list:
    dict_year_author[str(year)] = list(set(papers_df.loc[papers_df["Year"]==year]["Author"]))

dict_year_author_cumu[str(year_list[0])] = dict_year_author[str(year_list[0])]

for i in range(1,len(year_list)):
    key_ = str(year_list[i])
    prev_key = str(year_list[i-1])
    dict_year_author_cumu[str(key_)] = list(set(dict_year_author[key_] + dict_year_author_cumu[prev_key]))


       
        # Plotting trend for Authors publishing every year:
x_author = list(dict_year_author.keys())
y_author = [len(dict_year_author[key_]) for key_ in x_author]
fig = px.bar(x=x_author, y =y_author, title= "Number of Authors Publishing Each Year", color = y_author,labels={"y" :"Number of Authors", "x": "Year"})
fig.show()


       
        # Plotting trend for Active authors added new every year:
x_author_cumu = list(dict_year_author_cumu.keys())
y_author_cumu = [len(dict_year_author_cumu[key_]) for key_ in x_author_cumu]
fig = px.bar(x=x_author_cumu, y =y_author_cumu, title= "Number of Authors Active Each Year (Cummulative)", color = y_author_cumu,
            labels={"y" :"Number of Unique Authors", "x": "Year"}
            )
fig.show()


       
        # New authors added each year
new_author_year = [y_author_cumu[0]]

for i in range(1,len(y_author_cumu)):
    new_author_year.append(y_author_cumu[i]-y_author_cumu[i-1])

fig = px.bar(x=x_author_cumu, y =new_author_year, title= "Number of Authors Added In a Year", color = new_author_year,
            labels={"y" :"Number of New Authors", "x": "Year"}
            )
fig.show()


       
        papers = pd.read_csv('data/new_papers.csv')
papers = papers.drop_duplicates()

ath = []
val = []
for name, pub in papers.Author.value_counts().head(10).iteritems():
    ath.append(name)
    val.append(pub)

df = pd.DataFrame(list(zip(ath, val)),columns =['Authors', 'Papers'])

fig = px.bar(df, x="Authors", y="Papers", color='Authors')
fig.update_layout(title_text = "Authors publishing maximum number of papers",title_x=0.5)
fig.show()


       
        author_counts = papers.value_counts('Author').rename_axis('author_name').reset_index(name='paper_count')

fig = px.histogram(author_counts, x="paper_count", nbins = 60, color='paper_count', log_y=True, 
                            labels={"y": "Frequency (log scale)", "paper_count": "Paper Count"}
                    )
fig.update_layout(title_text = "Plot of total papers published by authors vs frequency (Log Scale)",title_x=0.5)
fig.show()


       
        # Now we try to answer the question in top 5% author, how many of them collab with each other
top_author_collab = pd.read_csv('data/new_papers.csv')
top_author_collab = top_author_collab.drop_duplicates()

author = []
value = []
# for name, pub in top_author_collab.Author.value_counts().head(len(top_author_collab)//100).iteritems():
for name, pub in top_author_collab.Author.value_counts().head(50).iteritems():
    author.append(name)
    value.append(pub)

top_author_collab = top_author_collab.loc[top_author_collab["Author"].isin(author)].copy()


       
        top_author_collab.sample(5)


       
        new_df = top_author_collab[['Conference','Year','Title', 'Author']].groupby(['Conference','Year','Title'])


       
        Authors_collab = []
key_seen = []

for i in top_author_collab.index:
    curr_key = tuple(top_author_collab.loc[i,['Conference','Year','Title']])
    if curr_key not in key_seen:
        key_seen.append(curr_key)
        lst_author = list(new_df.get_group(curr_key).Author)
        lst_author.sort()
        if lst_author not in Authors_collab:
            Authors_collab.append(lst_author)


       
        # Author collab stores all the edges
node_list = list(set(top_author_collab.Author))
edge_list = [i for i in Authors_collab if len(i)>=2]


       
        # creating the graph

import networkx as nx
import matplotlib.pyplot as plt
import itertools
from pyvis.network import Network
# creating notebook
net = Network(notebook = False)


       
        class GraphVisualization:
       
    def __init__(self):
        G = None
        self.visual = []
          
    # addEdge function inputs the vertices of an
    # edge and appends it to the visual list
    def addEdges(self, list_edges):
        to_add = list(itertools.permutations(list_edges, 2))
        for i in to_add:
            self.visual.append(list(i))
          
    # In visualize function G is an object of
    # class Graph given by networkx G.add_edges_from(visual)
    # creates a graph with a given list
    # nx.draw_networkx(G) - plots the graph
    # plt.show() - displays the graph
    def visualize(self):
        self.G = nx.Graph()
        self.G.add_edges_from(self.visual)
        # nx.draw_spring(G)
        pos = nx.random_layout(self.G)
        nx.draw(self.G, pos , with_labels = True, width=0.4, 
            node_color='lightblue', node_size=200)
        plt.figure(figsize=(13,13), dpi=300)
    
    def getGraphObj(self):
        self.G = nx.Graph()
        self.G.add_edges_from(self.visual)
        return self.G


       
        Graph = GraphVisualization()
for i in edge_list:
    Graph.addEdges(i)


       
        ## VISUALISING USING PYVIS
nx_graph = Graph.getGraphObj()


net.from_nx(nx_graph, show_edge_weights=False)
net.width = '1000px'
net.show("../_html/graph.html")


       
        from IPython.display import HTML
HTML('''    <button > <a style = "text-decoration: none; font-weight: 800;" href="https://ai-ml-growth-data-analysis.netlify.app/graph.html" target="_blank">SHOW COLLABORATION NETWORK </a></button>
''')


       
        print("The author having maximum unqiue collaboration is: ", sorted(Graph.G.degree, key=lambda x: x[1], reverse=True)[0][0], end="")
print(" with ",sorted(Graph.G.degree, key=lambda x: x[1], reverse=True)[0][1], " collaborations")

The author having maximum unqiue collaboration is:  Le Song with  8  collaborations


       
        PUNCTUATION = """!"#$%&'()*+,-./:;<=>?@[\]^_ *{|}~""" 
TOP_K_KEYWORDS = 10 # top k number of keywords to retrieve in a ranked document
STOPWORD_PATH = 'data/stopwords_ls.txt'
PAPERS_PATH = 'data/papers.csv'


       
        def get_sw_lst(path):
    
    with open(path,'r') as f:

        listl=[]
        for line in f:
            strip_lines=line.strip()
            listli=strip_lines.split()
            listl += listli
            
        return listl


       
        def clean_text(text):
    """Doc cleaning"""
    
    # Lowering text
    text = text.lower()
    
    # Removing punctuation
    text = "".join([c for c in text if c not in PUNCTUATION])
    
    # Removing whitespace and newlines
    text = re.sub('\s+',' ',text)
    
    return text


       
        def sort_coo(coo_matrix):
    """Sort a dict with highest score"""
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature, score
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

def get_keywords(vectorizer, feature_names, doc):
    """Return top k keywords from a doc using TF-IDF method"""

    #generate tf-idf for the given document
    tf_idf_vector = vectorizer.transform([doc])
    
    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only TOP_K_KEYWORDS
    keywords=extract_topn_from_vector(feature_names,sorted_items,TOP_K_KEYWORDS)
    
    return list(keywords.keys())


       
        data = pd.read_csv(PAPERS_PATH)
data.head()


       
        data.dropna(subset=['full_text'], inplace=True)
data['full_text'] = data['full_text'].apply(clean_text)

data.head()


       
        test = data.groupby('year').sample(n= 25, random_state=25)
train = data.drop(test.index, axis = 0)

corpora = train['full_text'].to_list()

# get the stop words list
stopwords=get_sw_lst(STOPWORD_PATH)

# Initializing TF-IDF Vectorizer with the given stopword list
vectorizer = TfidfVectorizer(stop_words=stopwords, smooth_idf=True, use_idf=True)

# Creating vocabulary with our corpora
vectorizer.fit_transform(corpora)

# Storing the created vocabulary
feature_names = vectorizer.get_feature_names()


       
        result = []
i = 0
test_lst = test['full_text'].to_list()
yrs = test.year.to_list()

word_data = {}

# Associating the top 10 keywords from each paper and their years
for doc in test_lst:
    df = {}
    df['full_text'] = doc
    df['year'] = yrs[i]
    wordsl = get_keywords(vectorizer, feature_names, doc)
    df['top_keywords'] = wordsl
    result.append(df)

    if(yrs[i] in word_data):
        word_data[yrs[i]] += wordsl
    else:
        word_data[yrs[i]] = wordsl

    i = i+1
    
final = pd.DataFrame(result)


       
        # importing all necessary modules
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt


       
         
for key,val in word_data.items():
    words_string=(" ").join(val)
    wordcloud = WordCloud(width = 1000, height = 500).generate(words_string)
    plt.figure(figsize=(15,8))
    plt.imshow(wordcloud)
    # plt.title('Word Cloud for the year {}'.format(str(i)))
    plt.axis("off")
    plt.savefig("assets/wc_{}.png".format(key), bbox_inches='tight')
    # plt.show()
    plt.close()


       
        import imageio
import matplotlib.image as mpimg
import matplotlib.pyplot as plt


       
        for i in range(1987,2020):

    x = 'assets/wc_{}.png'.format(i)
    img = mpimg.imread(x) 

    plt.figure(figsize=(15,8), facecolor='black')
    # plt.imshow(img)
    plt.axis("off")
    plt.title('Word Cloud for the year {}'.format(str(i)), color = 'white')
    plt.imshow(img)
    plt.savefig("assets/NTwc_{}.png".format(i), bbox_inches='tight')


       
        images = {}
for year in range(1987, 2020):
    x = './assets/NTwc_{}.png'.format(year)
    images[x] = str(year)


       
        new_images = []
for filename in list(images.keys()):
    new_images.append(imageio.imread(filename))

imageio.mimsave('./assets/gif/movie.gif', new_images, duration = 2)


       
        from IPython.display import Image
with open('./assets/gif/movie.gif','rb') as f:
    display(Image(data=f.read(), format='png'))


       
        df = pd.read_csv('./data/new_papers.csv')
df = df.drop_duplicates()
print(df.shape)
df.head()

(49296, 5)


       
        data = df['Title'].explode().unique()
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
tagged_data[0]

TaggedDocument(words=['attentional', 'processing', 'on', 'a', 'spike-based', 'vlsi', 'neural', 'network'], tags=['0'])


       
        max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm = 0)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha


       
        model.save("./assets/d2v.model")
print("Model Saved")
temp = model


       
        model= Doc2Vec.load("./assets/d2v.model")
title_embeddings = []
for i in range(len(model.dv)):
  temp = model.dv[i]
  title_embeddings.append(temp)
title_embeddings = np.array(title_embeddings)


       
        def create_node_trace(G):
    # collect node information from G to plot
    node_x = []
    node_y = []
    node_text = []
    node_color = []

    for i, node in enumerate(G.nodes(data=True)):
        # get node x,y position and store
        x, y = node[1]['pos']
        node_x.append(x)
        node_y.append(y)

        node_text.append(node[1]['text'])
        node_color.append(node[1]['color'])

    # create node trace (i.e., scatter plot)
    # make it invisible by default
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=False,
            color=node_color,
            size=16,
            line_width=0.5,
        ),
        text=node_text,
        visible=False
    )

    return node_trace


       
        def create_edge_trace(G):
    # collect edges information from G to plot
    edge_weight = []
    edge_text = []
    edge_pos = []
    edge_color = []
    
    for edge in G.edges(data=True):
        
        # edge is line connecting two points
        x0, y0 = G.nodes[edge[0]]['pos']
        x1, y1 = G.nodes[edge[1]]['pos']
        edge_pos.append([[x0, x1, None], [y0, y1, None]])
        
        # edge line color when drawn
        edge_color.append("black")

    # there is a trace for each edge
    edge_traces = []
    for i in range(len(edge_pos)):
        
        # edge line width
        line_width = 1

        # is scatter because it is line connecting two points
        trace = go.Scatter(
            x=edge_pos[i][0], y=edge_pos[i][1],
            line=dict(width=line_width, color=edge_color[i]),
            mode='lines',
            visible=False
        )
        edge_traces.append(trace)

    return edge_traces


       
        def filter_similarity_matrix_at_step(square_matrix, step_value):
    # copy matrix
    aux = square_matrix.copy()
    
    # set as NaN all values equal to or below threshold value
    aux[aux <= step_value] = np.nan
    
    # return filtered matrix
    return aux


       
        def get_interactive_slider_similarity_graph(square_matrix, slider_values, node_text=None, yaxisrange=None, xaxisrange=None):
    
    # Create figure with plotly
    fig = go.Figure()

    # key: slider value
    # value: list of traces to display for that slider value
    slider_dict = {}
    
    # total number of traces
    total_n_traces = 0
    
    # node positions on plot
    #node_pos = None

    # for each possible value in the slider, create and store traces (i.e., plots)
    for i, step_value in enumerate(slider_values):

        # update similarity matrix for the current step
        aux = filter_similarity_matrix_at_step(square_matrix, step_value)

        # create nx graph from sim matrix
        G = nx.to_networkx_graph(aux)
        
        # remove edges for 0 weight (NaN)
        G.remove_edges_from([(a, b) for a, b, attrs in G.edges(data=True) if np.isnan(attrs["weight"])])

        # assign node positions if None
        node_pos = nx.nx_pydot.graphviz_layout(G)

        # populate nodes with meta information
        for node in G.nodes(data=True):
            
            # node position
            node[1]['pos'] = node_pos[node[0]]

            # node color
            node[1]['color'] = "orange"

            # node text on hover if any is specified else is empty
            if node_text is not None:
                node[1]['text'] = node_text[node[0]]
            else:
                node[1]['text'] = ""

        # create edge taces (each edge is a trace, thus this is a list)
        edge_traces = create_edge_trace(G)
        
        # create node trace (a single trace for all nodes, thus it is not a list)
        node_trace = create_node_trace(G) 

        # store edge+node traces as single list for the current step value
        slider_dict[step_value] = edge_traces + [node_trace]
        
        # keep count of the total number of traces
        total_n_traces += len(slider_dict[step_value])

        # make sure that the first slider value is active for visualization
        if i == 0:
            for trace in slider_dict[step_value]:
                # make visible
                trace.visible = True

                
    # Create steps objects (one step per step_value)
    steps = []
    for step_value in slider_values:
        
        # count traces before adding new traces
        n_traces_before_adding_new = len(fig.data)
        
        # add new traces
        fig.add_traces(slider_dict[step_value])

        step = dict(
            # update figure when this step is active
            method="update",
            # make all traces invisible
            args=[{"visible": [False] * total_n_traces}],
            # label on the slider
            label=str(round(step_value, 3)),
        )

        # only toggle this step's traces visible, others remain invisible
        n_traces_for_step_value = len(slider_dict[step_value])
        for i in range(n_traces_before_adding_new, n_traces_before_adding_new + n_traces_for_step_value):
            step["args"][0]["visible"][i] = True
        
        # store step object in list of many steps
        steps.append(step)

    # create slider with list of step objects
    slider = [dict(
        active=0,
        steps=steps
    )]

    # add slider to figure and create layout
    fig.update_layout(
        sliders=slider,
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(range=xaxisrange, showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(range=yaxisrange, showgrid=False, zeroline=False, showticklabels=False),
        width=700, height=700,
    )

    return fig


       
        def Rand(start, end, num):
    res = []
 
    for j in range(num):
        res.append(random.randint(start, end))
 
    return res


       
        sample_index = Rand(0, len(title_embeddings), 50)
sample_title_embeddings = [title_embeddings[i] for i in sample_index]
sample_data = [data[i] for i in sample_index]


       
        similarity_matrix = cosine_similarity(sample_title_embeddings)
print(similarity_matrix.shape)

(50, 50)


       
        # define slider steps (i.e., threshold values)
slider_steps = np.arange(0.4, 0.85, 0.05)
    
# get the slider figure
fig = get_interactive_slider_similarity_graph(
    similarity_matrix,
    slider_steps,
    node_text = sample_data
)

fig.update_layout(title_text='Similarity between 50 random samples.', title_x=0.5)
# plot it
fig.show()


       
        def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens

    Args:
        text: Text to tokenize.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens


       
        from nltk.corpus import stopwords


       
        df_raw = pd.read_csv('./data/new_papers.csv')
df_raw = df_raw.drop_duplicates()

custom_stopwords = set(stopwords.words("english"))
text_columns = ["Title"]

df = df_raw.copy()

for col in text_columns:
    df[col] = df[col].astype(str)

# Create text column based on title, description, and content
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)
df["tokens"] = df["text"].map(lambda x: clean_text(x, word_tokenize, custom_stopwords))

# Remove duplicated after preprocessing
_, idx = np.unique(df["tokens"], return_index=True)
df = df.iloc[idx, :]

# Remove empty values and keep relevant columns
df = df.loc[df.tokens.map(lambda x: len(x) > 0), ["text", "tokens", "Year", "Conference"]]

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")


       
        docs = df["text"].values
tokenized_docs = df["tokens"].values


       
        model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=SEED)


       
        def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features
    
vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])


       
        def mbkmeans_clusters(
	X, 
    k, 
    mb, 
    print_silhouette_values, 
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_


       
        clustering, cluster_labels = mbkmeans_clusters(
	X=vectorized_docs,
    k=20,
    mb=500,
    print_silhouette_values=False,
)


       
        for test_cluster in range(20):
    most_representative_docs = np.argsort(np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1))
    print("Cluster", test_cluster, ":",docs[most_representative_docs[0]])

Cluster 0 : Learning Disentangled Representations and Group Structure of Dynamical Environments
Cluster 1 : Two Generator Game: Learning to Sample via Linear Goodness-of-Fit Test
Cluster 2 : Statistical Optimality of Stochastic Gradient Descent on Hard Learning Problems through Multiple Passes
Cluster 3 : A Gaussian Process Model of Quasar Spectral Energy Distributions
Cluster 4 : Extending position/phase-shift tuning to motion energy neurons improves velocity discrimination
Cluster 5 : Analog readout for optical reservoir computers
Cluster 6 : HAWQ-V2: Hessian Aware trace-Weighted Quantization of Neural Networks
Cluster 7 : Design Space for Graph Neural Networks
Cluster 8 : Learning Multiple Tasks using Manifold Regularization
Cluster 9 : GIANT: Globally Improved Approximate Newton Method for Distributed Optimization
Cluster 10 : Towards Image Understanding from Deep Compression Without Decoding
Cluster 11 : Fixed-Length Poisson MRF: Adding Dependencies to the Multinomial
Cluster 12 : Boosting First-Order Methods by Shifting Objective: New Schemes with Faster Worst-Case Rates
Cluster 13 : Coresets for Robust Training of Deep Neural Networks against Noisy Labels
Cluster 14 : Active Learning for Probabilistic Hypotheses Using the Maximum Gibbs Error Criterion
Cluster 15 : Manifold-regression to predict from MEG/EEG brain signals without source modeling
Cluster 16 : Predicting Scene Parsing and Motion Dynamics in the Future
Cluster 17 : Learning Accurate Entropy Model with Global Reference for Image Compression
Cluster 18 : Selective Sampling-based Scalable Sparse Subspace Clustering
Cluster 19 : Approximation Guarantees of Local Search Algorithms via Localizability of Set Functions


       
        df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels,
    "vectorized_docs": vectorized_docs,
    "year": list(df['Year']),
    "conference": list(df['Conference'])
})


       
        df_clusters.head()


       
        def Extract(lst, i):
    return [item[i] for item in lst]


       
        pca = PCA(2)
#Transform the data
pca_vectorized_docs = pca.fit_transform(vectorized_docs)
df_clusters['pca_vectorized_docs'] = list(pca_vectorized_docs)
df_clusters['axis0'] = Extract(df_clusters['pca_vectorized_docs'], 0)
df_clusters['axis1'] = Extract(df_clusters['pca_vectorized_docs'], 1)


       
        # Getting unique labels
u_labels = np.unique(df_clusters['cluster'])

# plotting the results:

f = plt.figure()
f.patch.set_facecolor('white')
f.set_figwidth(10)
f.set_figheight(10)

for i in u_labels:
    plt.scatter(Extract(df_clusters['pca_vectorized_docs'], 0), Extract(df_clusters['pca_vectorized_docs'],1) , c = df_clusters['cluster'])
plt.title('Plotting the title vectors colored based on their clustering.')
plt.xlabel('pca axis0')
plt.ylabel('pca axis1')
plt.show()


       
        cluster_terms_dict = {}
for i in range(20):
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=5)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    # print(f"Cluster {i}: {tokens_per_cluster}")
    cluster_terms_dict[i] = tokens_per_cluster

df_clusters['cluster_terms'] = [0]*df_clusters.shape[0]
for index, row in df_clusters.iterrows():
    df_clusters['cluster_terms'][index] = cluster_terms_dict[row['cluster']]


       
        new_df_clusters = df_clusters.drop_duplicates(subset=['year', 'cluster'], keep='first', inplace=False)
new_df_clusters = new_df_clusters.sort_values(by=['cluster', 'year'])
new_df_clusters = new_df_clusters[new_df_clusters['year'] != 2021]

fig = px.bar(new_df_clusters, x='year', y='cluster_count', animation_frame="cluster")
fig.show()


       
        data = [['Cluster number', 'Cluster terms']]
for i in  [1,4,5,6,7,15,18,19]:
    data.append([i, cluster_terms_dict[i]])

table = tabulate.tabulate(data, tablefmt='html')

from IPython.display import HTML, display
display(HTML(table))


       
        data = [['Cluster number', 'Cluster terms']]
for i in  [2,3,8,9,10,11,12,13,14,16,17]:
    data.append([i, cluster_terms_dict[i]])

table = tabulate.tabulate(data, tablefmt='html')

from IPython.display import HTML, display
display(HTML(table))


       
        df_clusters['cluster_count'] = [0]*df_clusters.shape[0]

for curr_year in range(2006, 2022):
    temp_df = df_clusters[df_clusters['year'] == curr_year]

    values = temp_df['cluster'].value_counts(dropna=False).keys().tolist()
    counts = [str(x) for x in temp_df['cluster'].value_counts(dropna=False).tolist()]
    cluster_count_dict  = dict(zip(values, counts))
    cluster_count_dict  = dict(sorted(cluster_count_dict.items()))

    for index, row in df_clusters.iterrows():
        if(df_clusters['year'][index] == curr_year):
            df_clusters['cluster_count'][index] = cluster_count_dict[row['cluster']]

df_clusters['cluster_count'] = pd.to_numeric(df_clusters['cluster_count'])


       
        df_clusters = pd.read_csv('./assets/df_clusters.csv')


       
        df_clusters = df_clusters.sort_values(by=['year'])


       
        pio.renderers.default = 'notebook_connected'

fig = px.scatter(df_clusters, x="axis0", y="axis1", animation_frame="year", animation_group="cluster_terms",
            color="cluster_terms", hover_name="text", size='cluster_count', size_max=55)

# fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()

	Conference	Year	Title	Author	Affiliation
0	NeurIPS	2006	Attentional Processing on a Spike-Based VLSI N...	Yingxue Wang	Swiss Federal Institute of Technology, Zurich
1	NeurIPS	2006	Attentional Processing on a Spike-Based VLSI N...	Rodney J Douglas	Institute of Neuroinformatics
2	NeurIPS	2006	Attentional Processing on a Spike-Based VLSI N...	Shih-Chii Liu	Institute for Neuroinformatics, University of ...
3	NeurIPS	2006	Multi-Task Feature Learning	Andreas Argyriou	Ecole Centrale de Paris
4	NeurIPS	2006	Multi-Task Feature Learning	Theos Evgeniou	INSEAD

	Conference	Year	Title	Author	Affiliation
0	NeurIPS	2006	Attentional Processing on a Spike-Based VLSI N...	Yingxue Wang	Swiss Federal Institute of Technology, Zurich
1	NeurIPS	2006	Attentional Processing on a Spike-Based VLSI N...	Rodney J Douglas	Institute of Neuroinformatics
2	NeurIPS	2006	Attentional Processing on a Spike-Based VLSI N...	Shih-Chii Liu	Institute for Neuroinformatics, University of ...
3	NeurIPS	2006	Multi-Task Feature Learning	Andreas Argyriou	Ecole Centrale de Paris
4	NeurIPS	2006	Multi-Task Feature Learning	Theos Evgeniou	INSEAD
...	...	...	...	...	...
49312	ICLR	2021	Self-supervised Representation Learning with R...	Martin Q Ma	School of Computer Science, Carnegie Mellon Un...
49313	ICLR	2021	Self-supervised Representation Learning with R...	Muqiao Yang	School of Computer Science, Carnegie Mellon Un...
49314	ICLR	2021	Self-supervised Representation Learning with R...	Han Zhao	University of Illinois, Urbana Champaign
49315	ICLR	2021	Self-supervised Representation Learning with R...	LP Morency	Carnegie Mellon University
49316	ICLR	2021	Self-supervised Representation Learning with R...	Ruslan Salakhutdinov	Carnegie-Mellon University

	Conference	Year	Title	Author	Affiliation
13815	NeurIPS	2017	Stochastic Approximation for Canonical Correla...	Nati Srebro	TTI-Chicago
23555	ICLR	2019	L-Shapley and C-Shapley: Efficient Model Inter...	Le Song	Ant Financial & Georgia Institute of Technology
40349	NeurIPS	2020	Robust Optimization for Fairness with Noisy Pr...	Michael Jordan	UC Berkeley
42210	NeurIPS	2020	Coresets via Bilevel Optimization for Continua...	Andreas Krause	ETH Zurich
7914	NeurIPS	2014	Recurrent Models of Visual Attention	Nicolas Heess	Google DeepMind

	source_id	year	title	abstract	full_text
0	27	1987	Bit-Serial Neural Networks	NaN	573 \n\nBIT - SERIAL NEURAL NETWORKS \n\nAlan...
1	63	1987	Connectivity Versus Entropy	NaN	1 \n\nCONNECTIVITY VERSUS ENTROPY \n\nYaser S...
2	60	1987	The Hopfield Model with Multi-Level Neurons	NaN	278 \n\nTHE HOPFIELD MODEL WITH MUL TI-LEVEL N...
3	59	1987	How Neural Nets Work	NaN	442 \n\nAlan Lapedes \nRobert Farber \n\nThe...
4	69	1987	Spatial Organization of Neural Networks: A Pro...	NaN	740 \n\nSPATIAL ORGANIZATION OF NEURAL NEn...

	source_id	year	title	abstract	full_text
0	27	1987	Bit-Serial Neural Networks	NaN	573 bitserialneuralnetworks alanfmurrayanthony...
1	63	1987	Connectivity Versus Entropy	NaN	1 connectivityversusentropy yasersabumostafa c...
2	60	1987	The Hopfield Model with Multi-Level Neurons	NaN	278 thehopfieldmodelwithmultilevelneurons mich...
3	59	1987	How Neural Nets Work	NaN	442 alanlapedes robertfarber theoreticaldivisi...
4	69	1987	Spatial Organization of Neural Networks: A Pro...	NaN	740 spatialorganizationofneuralnenorks aprobab...

Charting AI/ML Growth Across the World

Mahika Jaguste , IIT Gandhinagar, mahika.oj@iitgn.ac.in

Nipun Mahajan , IIT Gandhinagar, mahajan.n@iitgn.ac.in

Shrreya Singh , IIT Gandhinagar, singh.shrreya@iitgn.ac.in

Explaning The Datasets: ¶

Visualising The Data Points ¶

Collaboration Network of Top 50 Authors ¶

Analyzing research paper titles ¶

	text	tokens	cluster	vectorized_docs	year	conference
0	On 1/n neural representation and robustness	1n neural representation robustness	7	[0.056133837, -0.010953411, -0.18008822, 0.063...	2020	NeurIPS
1	Do 2D GANs Know 3D Shape? Unsupervised 3D Shap...	2d gans know 3d shape unsupervised 3d shape re...	14	[0.04935929, -0.04940794, -0.12340832, 0.04383...	2021	ICLR
2	3D-Aware Scene Manipulation via Inverse Graphics	3d aware scene manipulation via inverse graphics	14	[0.048220705, -0.045804773, -0.12345145, 0.046...	2018	NeurIPS
3	3D Gaze Concurrences from Head-mounted Cameras	3d gaze concurrences head mounted cameras	16	[0.037655167, -0.04429528, -0.09770333, 0.0377...	2012	NeurIPS
4	3D Multi-bodies: Fitting Sets of Plausible 3D ...	3d multi bodies fitting sets plausible 3d huma...	10	[0.061576527, -0.06371212, -0.14973295, 0.0626...	2020	NeurIPS

Cluster number	Cluster terms
1	application estimating without partial general
4	weight human dynamics scalable partial
5	application human single dependent estimating
6	network training graph attention equivariant
7	graph network recurrent training convolutional
15	scalable estimating application partial distributions
18	estimating beyond weighted general application
19	fast regularized faster randomized parallel

Cluster number	Cluster terms
2	accelerated convergence methods method coordinate
3	mixture variable dirichlet map mixtures
8	view approach knowledge embedding joint
9	algorithm method adaptive proximal problems
10	attention understanding aware uncertainty nets
11	single human gans application dynamics
12	faster parallel greedy fast regularized
13	network training graph generative recurrent
14	human scalable joint partial embedding
16	weight human dynamics uncertainty embedding
17	view semantic aware knowledge reasoning

Charting AI/ML Growth Across the World

Mahika Jaguste , IIT Gandhinagar, mahika.oj@iitgn.ac.in Nipun Mahajan , IIT Gandhinagar, mahajan.n@iitgn.ac.in Shrreya Singh , IIT Gandhinagar, singh.shrreya@iitgn.ac.in

Explaning The Datasets: ¶

Visualising The Data Points ¶

Collaboration Network of Top 50 Authors ¶

Analyzing research paper titles ¶

Mahika Jaguste , IIT Gandhinagar, mahika.oj@iitgn.ac.in

Nipun Mahajan , IIT Gandhinagar, mahajan.n@iitgn.ac.in

Shrreya Singh , IIT Gandhinagar, singh.shrreya@iitgn.ac.in