

sklearn example: computing the Wikipedia principal eigenvector with SVD

Introduction: the Wikipedia principal eigenvector

A classical way to assert the relative importance of vertices in a graph is to compute the principal eigenvector of the adjacency matrix, so as to assign to each vertex the value of the corresponding component of that eigenvector as a centrality score:

https://en.wikipedia.org/wiki/Eigenvector_centrality (eigenvector centrality)

On the graph of web pages and links, these values are called PageRank scores by Google.
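As a quick illustration of the idea, consider a hypothetical 4-vertex toy graph (not part of the example script below): the centrality scores are simply the components of the principal eigenvector of its adjacency matrix. A minimal sketch with NumPy:

import numpy as np

# adjacency matrix of a hypothetical toy graph (for illustration only)
A = np.array([[0, 1, 1, 0],
              [1, 0, 1, 0],
              [1, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)

# the principal eigenvector gives the eigenvector centrality of each vertex
eigenvalues, eigenvectors = np.linalg.eig(A)
principal = np.abs(eigenvectors[:, np.argmax(eigenvalues.real)])
print(principal / principal.sum())  # vertex 2, the best connected one, scores highest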

The goal of this example is to analyze the graph of links inside Wikipedia articles and rank the articles by relative importance according to this eigenvector centrality.

The traditional way to compute the principal eigenvector is to use the power iteration method:

https://en.wikipedia.org/wiki/Power_iteration (power iteration)
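For reference, a bare-bones power iteration could look as follows. This is only a sketch of the general technique, distinct from the PageRank-style centrality_scores function used in the full script below:

import numpy as np

def power_iteration(A, n_iter=100, tol=1e-10):
    # start from a uniform vector and repeatedly multiply by A, renormalizing each time
    v = np.full(A.shape[0], 1.0 / A.shape[0])
    for _ in range(n_iter):
        v_new = A @ v
        v_new /= np.linalg.norm(v_new)
        if np.abs(v_new - v).max() < tol:
            break
        v = v_new
    return v_new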

Here the computation is achieved thanks to Martinsson's randomized SVD algorithm implemented in scikit-learn.
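The call used below is sklearn.decomposition.randomized_svd. A quick sketch of its return values on a small random sparse matrix (the matrix here is arbitrary demo data, not the Wikipedia graph):

import numpy as np
from scipy import sparse
from sklearn.decomposition import randomized_svd

rng = np.random.RandomState(0)
X_demo = sparse.random(1000, 1000, density=0.001, format='csr', random_state=rng)

# U: left singular vectors, s: singular values, V: right singular vectors
U, s, V = randomized_svd(X_demo, n_components=5, n_iter=3, random_state=rng)
print(U.shape, s.shape, V.shape)  # (1000, 5) (5,) (5, 1000)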

The graph data is fetched from the DBpedia dumps. DBpedia is an extraction of the latent structured data of the Wikipedia content.
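Each line of the .nt dumps is a whitespace-separated triple ending with a dot, and the short_name helper in the script strips the URI markers and the common resource prefix to recover the article name. A rough illustration (the line content and predicate URI are made up for this sketch; the actual dump may differ):

# a hypothetical N-Triples line, in the byte-string form yielded by BZ2File
line = (b"<http://dbpedia.org/resource/AccessibleComputing> "
        b"<http://dbpedia.org/ontology/wikiPageRedirects> "
        b"<http://dbpedia.org/resource/Computer_accessibility> .")
subject, predicate, obj, dot = line.split()
prefix_len = len("http://dbpedia.org/resource/")
print(subject[prefix_len + 1:-1])  # b'AccessibleComputing'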

 

Code implementation [Python]



# -*- coding: utf-8 -*- 



# Author: Olivier Grisel 

# License: BSD 3 clause



from bz2 import BZ2File

import os

from datetime import datetime

from pprint import pprint

from time import time



import numpy as np



from scipy import sparse



from joblib import Memory



from sklearn.decomposition import randomized_svd

from urllib.request import urlopen





print(__doc__)



# #############################################################################

# Download the data, if not already on disk

redirects_url = "http://downloads.dbpedia.org/3.5.1/en/redirects_en.nt.bz2"

redirects_filename = redirects_url.rsplit("/", 1)[1]



page_links_url = "http://downloads.dbpedia.org/3.5.1/en/page_links_en.nt.bz2"

page_links_filename = page_links_url.rsplit("/", 1)[1]



resources = [

    (redirects_url, redirects_filename),

    (page_links_url, page_links_filename),

]



for url, filename in resources:

    if not os.path.exists(filename):

        print("Downloading data from '%s', please wait..." % url)

        opener = urlopen(url)

        with open(filename, 'wb') as f:
            f.write(opener.read())

        print()





# #############################################################################

# Loading the redirect files



memory = Memory(location=".")  # recent joblib versions use 'location' instead of the removed 'cachedir'





def index(redirects, index_map, k):

    """Find the index of an article name after redirect resolution"""

    k = redirects.get(k, k)

    return index_map.setdefault(k, len(index_map))





DBPEDIA_RESOURCE_PREFIX_LEN = len("http://dbpedia.org/resource/")

SHORTNAME_SLICE = slice(DBPEDIA_RESOURCE_PREFIX_LEN + 1, -1)





def short_name(nt_uri):

    """Remove the  URI markers and the common URI prefix"""

    return nt_uri[SHORTNAME_SLICE]





def get_redirects(redirects_filename):

    """Parse the redirections and build a transitively closed map out of it"""

    redirects = {}

    print("Parsing the NT redirect file")

    for l, line in enumerate(BZ2File(redirects_filename)):

        split = line.split()

        if len(split) != 4:

            print("ignoring malformed line: " + line)

            continue

        redirects[short_name(split[0])] = short_name(split[2])

        if l % 1000000 == 0:

            print("[%s] line: %08d" % (datetime.now().isoformat(), l))



    # compute the transitive closure

    print("Computing the transitive closure of the redirect relation")

    for l, source in enumerate(redirects.keys()):

        transitive_target = None

        target = redirects[source]

        seen = {source}

        # follow the chain of redirects until reaching a terminal target or a cycle
        while True:

            transitive_target = target

            target = redirects.get(target)

            if target is None or target in seen:

                break

            seen.add(target)

        redirects[source] = transitive_target

        if l % 1000000 == 0:

            print("[%s] line: %08d" % (datetime.now().isoformat(), l))



    return redirects





# disabling joblib as the pickling of large dicts seems much too slow

#@memory.cache

def get_adjacency_matrix(redirects_filename, page_links_filename, limit=None):

    """Extract the adjacency graph as a scipy sparse matrix



    Redirects are resolved first.



    Returns X, the scipy sparse adjacency matrix, redirects as python

    dict from article names to article names and index_map a python dict

    from article names to python int (article indexes).

    """



    print("Computing the redirect map")

    redirects = get_redirects(redirects_filename)



    print("Computing the integer index map")

    index_map = dict()

    links = list()

    for l, line in enumerate(BZ2File(page_links_filename)):

        split = line.split()

        if len(split) != 4:

            print("ignoring malformed line: " + line)

            continue

        i = index(redirects, index_map, short_name(split[0]))

        j = index(redirects, index_map, short_name(split[2]))

        links.append((i, j))

        if l % 1000000 == 0:

            print("[%s] line: %08d" % (datetime.now().isoformat(), l))



        if limit is not None and l >= limit - 1:

            break



    print("Computing the adjacency matrix")

    X = sparse.lil_matrix((len(index_map), len(index_map)), dtype=np.float32)

    for i, j in links:

        X[i, j] = 1.0

    del links

    print("Converting to CSR representation")

    X = X.tocsr()

    print("CSR conversion done")

    return X, redirects, index_map





# stop after 5M links to make it possible to work in RAM

X, redirects, index_map = get_adjacency_matrix(

    redirects_filename, page_links_filename, limit=5000000)

names = {i: name for name, i in index_map.items()}



print("Computing the principal singular vectors using randomized_svd")

t0 = time()

U, s, V = randomized_svd(X, 5, n_iter=3)

print("done in %0.3fs" % (time() - t0))



# print the names of the wikipedia related strongest components of the

# principal singular vector which should be similar to the highest eigenvector

print("Top wikipedia pages according to principal singular vectors")

pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]])

pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]])





def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10):

    """Power iteration computation of the principal eigenvector



    This method is also known as Google PageRank and the implementation

    is based on the one from the NetworkX project (BSD licensed too)

    with copyrights by:



      Aric Hagberg 

      Dan Schult 

      Pieter Swart 

    """

    n = X.shape[0]

    X = X.copy()

    incoming_counts = np.asarray(X.sum(axis=1)).ravel()



    print("Normalizing the graph")

    for i in incoming_counts.nonzero()[0]:

        X.data[X.indptr[i]:X.indptr[i + 1]] *= 1.0 / incoming_counts[i]

    # pages with no outgoing links ("dangling" nodes) redistribute their score uniformly
    dangle = np.asarray(np.where(np.isclose(X.sum(axis=1), 0),
                                 1.0 / n, 0)).ravel()



    scores = np.full(n, 1. / n, dtype=np.float32)  # initial guess

    for i in range(max_iter):

        print("power iteration #%d" % i)

        prev_scores = scores

        # damped propagation of the scores along the links plus the mass of
        # dangling nodes, with a (1 - alpha) teleportation term (PageRank update)
        scores = (alpha * (scores * X + np.dot(dangle, prev_scores))
                  + (1 - alpha) * prev_scores.sum() / n)

        # check convergence: normalized l_inf norm

        scores_max = np.abs(scores).max()

        if scores_max == 0.0:

            scores_max = 1.0

        err = np.abs(scores - prev_scores).max() / scores_max

        print("error: %0.6f" % err)

        if err < n * tol:

            return scores



    return scores



print("Computing principal eigenvector score using a power iteration method")

t0 = time()

scores = centrality_scores(X, max_iter=100, tol=1e-10)

print("done in %0.3fs" % (time() - t0))

pprint([names[i] for i in np.abs(scores).argsort()[-10:]])



Code execution

Total running time of the script: approximately 0 minutes 0.000 seconds.
