| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						__author__ = 'Dmitry Ustalov' | 
					
					
						
						| 
							 | 
						__license__ = 'Apache 2.0' | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						import csv | 
					
					
						
						| 
							 | 
						import os | 
					
					
						
						| 
							 | 
						import re | 
					
					
						
						| 
							 | 
						import subprocess | 
					
					
						
						| 
							 | 
						from dataclasses import dataclass | 
					
					
						
						| 
							 | 
						from tempfile import NamedTemporaryFile | 
					
					
						
						| 
							 | 
						from typing import cast, BinaryIO, Optional | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						import gradio as gr | 
					
					
						
						| 
							 | 
						import matplotlib.pyplot as plt | 
					
					
						
						| 
							 | 
						import networkx as nx | 
					
					
						
						| 
							 | 
						import pandas as pd | 
					
					
						
						| 
							 | 
						from matplotlib.pyplot import Figure   | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						if 'MCL_BIN' in os.environ and os.path.isfile(os.environ['MCL_BIN']) and os.access(os.environ['MCL_BIN'], os.X_OK): | 
					
					
						
						| 
							 | 
						    MCL: Optional[str] = os.environ['MCL_BIN'] | 
					
					
						
						| 
							 | 
						else: | 
					
					
						
						| 
							 | 
						    MCL = None | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						@dataclass | 
					
					
						
						| 
							 | 
						class Algorithm: | 
					
					
						
						| 
							 | 
						    name: str | 
					
					
						
						| 
							 | 
						    mode: Optional[str] = None | 
					
					
						
						| 
							 | 
						    local_name: Optional[str] = None | 
					
					
						
						| 
							 | 
						    local_params: Optional[str] = None | 
					
					
						
						| 
							 | 
						    global_name: Optional[str] = None | 
					
					
						
						| 
							 | 
						    global_params: Optional[str] = None | 
					
					
						
						| 
							 | 
						    bin: Optional[str] = None | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    def args_clustering(self) -> list[str]: | 
					
					
						
						| 
							 | 
						        args = [self.name] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if self.mode: | 
					
					
						
						| 
							 | 
						            args.extend(['--mode', self.mode]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        args.extend(self.args_graph()) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if self.global_name: | 
					
					
						
						| 
							 | 
						            args.extend(['--global', self.global_name]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if self.global_params: | 
					
					
						
						| 
							 | 
						            args.extend(['--global-params', self.global_params]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if self.bin: | 
					
					
						
						| 
							 | 
						            args.extend(['--bin', self.bin]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        return args | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    def args_graph(self) -> list[str]: | 
					
					
						
						| 
							 | 
						        args = [] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if self.local_name: | 
					
					
						
						| 
							 | 
						            args.extend(['--local', self.local_name]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if self.local_params: | 
					
					
						
						| 
							 | 
						            args.extend(['--local-params', self.local_params]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        return args | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						ALGORITHMS: dict[str, Algorithm] = { | 
					
					
						
						| 
							 | 
						    'CW_top': Algorithm('cw', 'top'), | 
					
					
						
						| 
							 | 
						    'CW_lin': Algorithm('cw', 'lin'), | 
					
					
						
						| 
							 | 
						    'CW_log': Algorithm('cw', 'log'), | 
					
					
						
						| 
							 | 
						    'MaxMax': Algorithm('maxmax'), | 
					
					
						
						| 
							 | 
						    'Watset[CW_top, CW_top]': Algorithm('watset', None, 'cw', 'mode=top', 'cw', 'mode=top'), | 
					
					
						
						| 
							 | 
						    'Watset[CW_lin, CW_top]': Algorithm('watset', None, 'cw', 'mode=lin', 'cw', 'mode=top'), | 
					
					
						
						| 
							 | 
						    'Watset[CW_log, CW_top]': Algorithm('watset', None, 'cw', 'mode=log', 'cw', 'mode=top'), | 
					
					
						
						| 
							 | 
						    'Watset[MCL, CW_top]': Algorithm('watset', None, 'mcl', None, 'cw', 'mode=top'), | 
					
					
						
						| 
							 | 
						    'Watset[CW_top, CW_lin]': Algorithm('watset', None, 'cw', 'mode=top', 'cw', 'mode=lin'), | 
					
					
						
						| 
							 | 
						    'Watset[CW_lin, CW_lin]': Algorithm('watset', None, 'cw', 'mode=lin', 'cw', 'mode=lin'), | 
					
					
						
						| 
							 | 
						    'Watset[CW_log, CW_lin]': Algorithm('watset', None, 'cw', 'mode=log', 'cw', 'mode=lin'), | 
					
					
						
						| 
							 | 
						    'Watset[MCL, CW_lin]': Algorithm('watset', None, 'mcl', None, 'cw', 'mode=lin'), | 
					
					
						
						| 
							 | 
						    'Watset[CW_top, CW_log]': Algorithm('watset', None, 'cw', 'mode=top', 'cw', 'mode=log'), | 
					
					
						
						| 
							 | 
						    'Watset[CW_lin, CW_log]': Algorithm('watset', None, 'cw', 'mode=lin', 'cw', 'mode=log'), | 
					
					
						
						| 
							 | 
						    'Watset[CW_log, CW_log]': Algorithm('watset', None, 'cw', 'mode=log', 'cw', 'mode=log'), | 
					
					
						
						| 
							 | 
						    'Watset[MCL, CW_log]': Algorithm('watset', None, 'mcl', None, 'cw', 'mode=log'), | 
					
					
						
						| 
							 | 
						} | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						if MCL: | 
					
					
						
						| 
							 | 
						    ALGORITHMS.update({ | 
					
					
						
						| 
							 | 
						        'Watset[CW_top, MCL]': Algorithm('watset', None, 'cw', 'mode=top', 'mcl-bin', 'bin=' + MCL), | 
					
					
						
						| 
							 | 
						        'Watset[CW_lin, MCL]': Algorithm('watset', None, 'cw', 'mode=lin', 'mcl-bin', 'bin=' + MCL), | 
					
					
						
						| 
							 | 
						        'Watset[CW_log, MCL]': Algorithm('watset', None, 'cw', 'mode=log', 'mcl-bin', 'bin=' + MCL), | 
					
					
						
						| 
							 | 
						        'Watset[MCL, MCL]': Algorithm('watset', None, 'mcl', None, 'mcl-bin', 'bin=' + MCL), | 
					
					
						
						| 
							 | 
						        'MCL': Algorithm('mcl-bin', bin=MCL) | 
					
					
						
						| 
							 | 
						    }) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						SENSE = re.compile(r'^(?P<item>\d+)#(?P<sense>\d+)$') | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						def visualize(G: 'nx.Graph[str]', seed: int = 0) -> Figure: | 
					
					
						
						| 
							 | 
						    pos = nx.spring_layout(G, seed=seed) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    fig = plt.figure(dpi=240) | 
					
					
						
						| 
							 | 
						    plt.axis('off') | 
					
					
						
						| 
							 | 
						    nx.draw_networkx_edges(G, pos, alpha=.15) | 
					
					
						
						| 
							 | 
						    nx.draw_networkx_labels(G, pos) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    return fig | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						def watset(G: 'nx.Graph[str]', algorithm: str, seed: int = 0, | 
					
					
						
						| 
							 | 
						           jar: str = 'watset.jar', timeout: int = 10) -> tuple[pd.DataFrame, Optional['nx.Graph[str]']]: | 
					
					
						
						| 
							 | 
						    with (NamedTemporaryFile() as graph, | 
					
					
						
						| 
							 | 
						          NamedTemporaryFile(mode='rb') as clusters, | 
					
					
						
						| 
							 | 
						          NamedTemporaryFile(mode='rb') as senses): | 
					
					
						
						| 
							 | 
						        nx.write_edgelist(G, graph.name, delimiter='\t', data=['weight']) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        try: | 
					
					
						
						| 
							 | 
						            result = subprocess.run(['java', '-jar', jar, | 
					
					
						
						| 
							 | 
						                                     '--input', graph.name, '--output', clusters.name, '--seed', str(seed), | 
					
					
						
						| 
							 | 
						                                     *ALGORITHMS[algorithm].args_clustering()], | 
					
					
						
						| 
							 | 
						                                    capture_output=True, text=True, timeout=timeout) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						            if result.returncode != 0: | 
					
					
						
						| 
							 | 
						                raise gr.Error(f'Clustering error (code {result.returncode}): {result.stderr}') | 
					
					
						
						| 
							 | 
						        except subprocess.SubprocessError as e: | 
					
					
						
						| 
							 | 
						            raise gr.Error(f'Clustering error: {e}') | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        df_clusters = pd.read_csv(clusters, sep='\t', names=('cluster', 'size', 'items'), | 
					
					
						
						| 
							 | 
						                                  dtype={'cluster': int, 'size': int, 'items': str}) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        df_clusters['items'] = df_clusters['items'].str.split(', ') | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if ALGORITHMS[algorithm].name == 'watset': | 
					
					
						
						| 
							 | 
						            try: | 
					
					
						
						| 
							 | 
						                result = subprocess.run(['java', '-jar', jar, | 
					
					
						
						| 
							 | 
						                                         '--input', graph.name, '--output', senses.name, '--seed', str(seed), | 
					
					
						
						| 
							 | 
						                                         'graph', *ALGORITHMS[algorithm].args_graph()], | 
					
					
						
						| 
							 | 
						                                        capture_output=True, text=True, timeout=timeout) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						                if result.returncode != 0: | 
					
					
						
						| 
							 | 
						                    raise gr.Error(f'Graph error (code {result.returncode}): {result.stderr}') | 
					
					
						
						| 
							 | 
						            except subprocess.SubprocessError as e: | 
					
					
						
						| 
							 | 
						                raise gr.Error(f'Graph error: {e}') | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						            G_senses = nx.read_edgelist(senses.name, delimiter='\t', comments='\n', data=[('weight', float)]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						            return df_clusters, G_senses | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        return df_clusters, None | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def handler(file: BinaryIO, algorithm: str, seed: int) -> tuple[pd.DataFrame, Figure]: | 
					
					
						
						| 
							 | 
						    if file is None: | 
					
					
						
						| 
							 | 
						        raise gr.Error('File must be uploaded') | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    if algorithm not in ALGORITHMS: | 
					
					
						
						| 
							 | 
						        raise gr.Error(f'Unknown algorithm: {algorithm}') | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    with open(file.name) as f: | 
					
					
						
						| 
							 | 
						        try: | 
					
					
						
						| 
							 | 
						            dialect = csv.Sniffer().sniff(f.read(4096)) | 
					
					
						
						| 
							 | 
						            delimiter = dialect.delimiter | 
					
					
						
						| 
							 | 
						        except csv.Error: | 
					
					
						
						| 
							 | 
						            delimiter = ',' | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    G: 'nx.Graph[str]' = nx.read_edgelist(file.name, delimiter=delimiter, comments='\n', data=[('weight', float)]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    mapping: dict[str, int] = {} | 
					
					
						
						| 
							 | 
						    reverse: dict[int, str] = {} | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    for i, node in enumerate(G): | 
					
					
						
						| 
							 | 
						        mapping[node] = i | 
					
					
						
						| 
							 | 
						        reverse[i] = node | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    nx.relabel_nodes(G, mapping, copy=False) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    df_clusters, G_senses = watset(G, algorithm=algorithm, seed=seed) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    nx.relabel_nodes(G, reverse, copy=False) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    df_clusters['items'] = df_clusters['items'].apply(lambda items: sorted(reverse[int(item)] for item in items)) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    if G_senses is None: | 
					
					
						
						| 
							 | 
						        fig = visualize(G, seed=seed) | 
					
					
						
						| 
							 | 
						    else: | 
					
					
						
						| 
							 | 
						        sense_mapping = {node: f'{reverse[int(match["item"])]}#{match["sense"]}'   | 
					
					
						
						| 
							 | 
						                         for node in G_senses for match in (SENSE.match(node),)} | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        nx.relabel_nodes(G_senses, sense_mapping, copy=False) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        fig = visualize(G_senses, seed=seed) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    return df_clusters, fig | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def main() -> None: | 
					
					
						
						| 
							 | 
						    iface = gr.Interface( | 
					
					
						
						| 
							 | 
						        fn=handler, | 
					
					
						
						| 
							 | 
						        inputs=[ | 
					
					
						
						| 
							 | 
						            gr.File( | 
					
					
						
						| 
							 | 
						                file_types=['.tsv', '.csv'], | 
					
					
						
						| 
							 | 
						                label='Graph' | 
					
					
						
						| 
							 | 
						            ), | 
					
					
						
						| 
							 | 
						            gr.Dropdown( | 
					
					
						
						| 
							 | 
						                choices=cast(list[str], ALGORITHMS), | 
					
					
						
						| 
							 | 
						                value='Watset[MCL, CW_lin]', | 
					
					
						
						| 
							 | 
						                label='Algorithm' | 
					
					
						
						| 
							 | 
						            ), | 
					
					
						
						| 
							 | 
						            gr.Number( | 
					
					
						
						| 
							 | 
						                label='Seed', | 
					
					
						
						| 
							 | 
						                precision=0 | 
					
					
						
						| 
							 | 
						            ) | 
					
					
						
						| 
							 | 
						        ], | 
					
					
						
						| 
							 | 
						        outputs=[ | 
					
					
						
						| 
							 | 
						            gr.Dataframe( | 
					
					
						
						| 
							 | 
						                headers=['cluster', 'size', 'items'], | 
					
					
						
						| 
							 | 
						                label='Clustering' | 
					
					
						
						| 
							 | 
						            ), | 
					
					
						
						| 
							 | 
						            gr.Plot( | 
					
					
						
						| 
							 | 
						                label='Graph' | 
					
					
						
						| 
							 | 
						            ) | 
					
					
						
						| 
							 | 
						        ], | 
					
					
						
						| 
							 | 
						        examples=[ | 
					
					
						
						| 
							 | 
						            ['java.tsv', 'Watset[MCL, CW_lin]', 0], | 
					
					
						
						| 
							 | 
						            ['java.tsv', 'MaxMax', 0], | 
					
					
						
						| 
							 | 
						            ['bank.tsv', 'Watset[MCL, MCL]', 0], | 
					
					
						
						| 
							 | 
						            ['bank.tsv', 'MCL', 0], | 
					
					
						
						| 
							 | 
						        ], | 
					
					
						
						| 
							 | 
						        title='Structure Discovery with Watset', | 
					
					
						
						| 
							 | 
						        description=''' | 
					
					
						
						| 
							 | 
						**Watset** is a powerful algorithm for structure discovery in undirected graphs. | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						By capturing the ambiguity of nodes in a graph, Watset efficiently finds clusters in the input data. | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						As the input, this tool expects [edge list](https://en.wikipedia.org/wiki/Edge_list) as a comma-separated (CSV) file without header. | 
					
					
						
						| 
							 | 
						Each line of the file should contain three columns: | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						- `source`: edge source | 
					
					
						
						| 
							 | 
						- `target`: edge target | 
					
					
						
						| 
							 | 
						- `weight`: edge weight | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						Whether you're working with linguistic data or other networks, Watset is the go-to solution for unlocking hidden patterns and structures. | 
					
					
						
						| 
							 | 
						        ''', | 
					
					
						
						| 
							 | 
						        article=''' | 
					
					
						
						| 
							 | 
						**More Watset:** | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						- Paper: <https://doi.org/10.1162/COLI_a_00354> ([arXiv](https://arxiv.org/abs/1808.06696)) | 
					
					
						
						| 
							 | 
						- GitHub: <https://github.com/nlpub/watset-java> | 
					
					
						
						| 
							 | 
						- Maven Central: <https://search.maven.org/artifact/org.nlpub/watset> | 
					
					
						
						| 
							 | 
						- conda-forge: <https://anaconda.org/conda-forge/watset> | 
					
					
						
						| 
							 | 
						        ''', | 
					
					
						
						| 
							 | 
						        flagging_mode='never', | 
					
					
						
						| 
							 | 
						    ) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    iface.launch() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						if __name__ == '__main__': | 
					
					
						
						| 
							 | 
						    main() | 
					
					
						
						| 
							 | 
						
 |