使用 python 从 wikidata 转储中提取别名

Posted

技术标签:

【中文标题】使用 python 从 wikidata 转储中提取别名【英文标题】:extract aliases from wikidata dump using python 【发布时间】:2022-01-10 16:38:10 【问题描述】:

我正在尝试从 wikidata 转储中提取 wikidata 项目的某些字段,但在提取特定语言的 aliases 字段时遇到了问题。我的代码基于 how_to_use_a_wikidata_dump 这篇文章中的代码,并做了一些修改,但 aliases 字段始终返回空值:

 for record in wikidata(args.dumpfile):
    print('i = '+str(i)+' item '+record['id']+'  started!'+'\n')
    item_id = pydash.get(record, 'id')
    item_type = pydash.get(record, 'claims.P31[0].mainsnak.datavalue.value.id')
    arabic_label = pydash.get(record, 'labels.ar.value')
    english_label = pydash.get(record, 'labels.en.value')
    arabic_aliases =pydash.get(record, 'aliases.ar.value')
    english_aliases =pydash.get(record, 'aliases.en.value')
    arabic_desc = pydash.get(record, 'descriptions.ar.value')
    english_desc = pydash.get(record, 'descriptions.en.value')
    main_category = pydash.get(record, 'claims.P910[0].mainsnak.datavalue.value.id')
    arwiki = pydash.get(record, 'sitelinks.arwiki.title')
    arwikiquote = pydash.get(record, 'sitelinks.arwikiquote.title')
    enwiki = pydash.get(record, 'sitelinks.enwiki.title')
    enwikiquote = pydash.get(record, 'sitelinks.enwikiquote.title')

wikidata 项目的 JSON 格式可以在这里找到: JSON Format

JSON 记录示例


{
  "pageid": 186,
  "ns": 0,
  "title": "Q60",
  "lastrevid": 199780882,
  "modified": "2020-02-27T14:37:20Z",
  "id": "Q60",
  "type": "item",
  "aliases": {
    "en": [
      {
        "language": "en",
        "value": "NYC"
      },
      {
        "language": "en",
        "value": "New York"
      }
    ],
    "fr": [
      {
        "language": "fr",
        "value": "New York City"
      },
      {
        "language": "fr",
        "value": "NYC"
      }
    ],
    "zh-mo": [
      {
        "language": "zh-mo",
        "value": "\u7d10\u7d04\u5e02"
      }
    ]
  },
  "labels": {
    "en": {
      "language": "en",
      "value": "New York City"
    },
    "ar": {
      "language": "ar",
      "value": "\u0645\u062f\u064a\u0646\u0629 \u0646\u064a\u0648 \u064a\u0648\u0631\u0643"
    },
    "fr": {
      "language": "fr",
      "value": "New York City"
    },
    "my": {
      "language": "my",
      "value": "\u1014\u101a\u1030\u1038\u101a\u1031\u102c\u1000\u103a\u1019\u103c\u102d\u102f\u1037"
    },
    "ps": {
      "language": "ps",
      "value": "\u0646\u064a\u0648\u064a\u0627\u0631\u06a9"
    }
  },
  "descriptions": {
    "en": {
      "language": "en",
      "value": "largest city in New York and the United States of America"
    },
    "it": {
      "language": "it",
      "value": "citt\u00e0 degli Stati Uniti d'America"
    },
    "pl": {
      "language": "pl",
      "value": "miasto w Stanach Zjednoczonych"
    },
    "ro": {
      "language": "ro",
      "value": "ora\u015ful cel mai mare din SUA"
    }
  },
  "claims": {
    "P1151": [
      {
        "id": "Q60$6f832804-4c3f-6185-38bd-ca00b8517765",
        "mainsnak": {
          "snaktype": "value",
          "property": "P1151",
          "datatype": "wikibase-item",
          "datavalue": {
            "value": {
              "entity-type": "item",
              "id": "Q6342720",
              "numeric-id": 6342720
            },
            "type": "wikibase-entityid"
          }
        },
        "type": "statement",
        "rank": "normal"
      }
    ],
    "P625": [
      {
        "id": "q60$f00c56de-4bac-e259-b146-254897432868",
        "mainsnak": {
          "snaktype": "value",
          "property": "P625",
          "datatype": "globe-coordinate",
          "datavalue": {
            "value": {
              "latitude": 40.67,
              "longitude": -73.94,
              "altitude": null,
              "precision": 0.00027777777777778,
              "globe": "http://www.wikidata.org/entity/Q2"
            },
            "type": "globecoordinate"
          }
        },
        "type": "statement",
        "rank": "normal",
        "references": [
          {
            "hash": "7eb64cf9621d34c54fd4bd040ed4b61a88c4a1a0",
            "snaks": {
              "P143": [
                {
                  "snaktype": "value",
                  "property": "P143",
                  "datatype": "wikibase-item",
                  "datavalue": {
                    "value": {
                      "entity-type": "item",
                      "id": "Q328",
                      "numeric-id": 328
                    },
                    "type": "wikibase-entityid"
                  }
                }
              ]
            },
            "snaks-order": [
              "P143"
            ]
          }
        ]
      }
    ],
    "P150": [
      {
        "id": "Q60$bdddaa06-4e4b-f369-8954-2bb010aaa057",
        "mainsnak": {
          "snaktype": "value",
          "property": "P150",
          "datatype": "wikibase-item",
          "datavalue": {
            "value": {
              "entity-type": "item",
              "id": "Q11299",
              "numeric-id": 11299
            },
            "type": "wikibase-entityid"
          }
        },
        "type": "statement",
        "rank": "normal"
      },
      {
        "id": "Q60$0e484d5b-41a5-1594-7ae1-c3768c6206f6",
        "mainsnak": {
          "snaktype": "value",
          "property": "P150",
          "datatype": "wikibase-item",
          "datavalue": {
            "value": {
              "entity-type": "item",
              "id": "Q18419",
              "numeric-id": 18419
            },
            "type": "wikibase-entityid"
          }
        },
        "type": "statement",
        "rank": "normal"
      },
      {
        "id": "Q60$e5000a60-42fc-2aba-f16d-bade1d2e8a58",
        "mainsnak": {
          "snaktype": "value",
          "property": "P150",
          "datatype": "wikibase-item",
          "datavalue": {
            "value": {
              "entity-type": "item",
              "id": "Q18424",
              "numeric-id": 18424
            },
            "type": "wikibase-entityid"
          }
        },
        "type": "statement",
        "rank": "normal"
      },
      {
        "id": "Q60$4d90d6f4-4ab8-26bd-f2a5-4ac2a6eb48cd",
        "mainsnak": {
          "snaktype": "value",
          "property": "P150",
          "datatype": "wikibase-item",
          "datavalue": {
            "value": {
              "entity-type": "item",
              "id": "Q18426",
              "numeric-id": 18426
            },
            "type": "wikibase-entityid"
          }
        },
        "type": "statement",
        "rank": "normal"
      },
      {
        "id": "Q60$ede49e3c-44f6-75a3-eb74-6a89886e30c9",
        "mainsnak": {
          "snaktype": "value",
          "property": "P150",
          "datatype": "wikibase-item",
          "datavalue": {
            "value": {
              "entity-type": "item",
              "id": "Q18432",
              "numeric-id": 18432
            },
            "type": "wikibase-entityid"
          }
        },
        "type": "statement",
        "rank": "normal"
      }
    ],
    "P6": [
      {
        "id": "Q60$5cc8fc79-4807-9800-dbea-fe9c20ab273b",
        "mainsnak": {
          "snaktype": "value",
          "property": "P6",
          "datatype": "wikibase-item",
          "datavalue": {
            "value": {
              "entity-type": "item",
              "id": "Q4911497",
              "numeric-id": 4911497
            },
            "type": "wikibase-entityid"
          }
        },
        "qualifiers": {
          "P580": [
            {
              "hash": "c53f3ca845b789e543ed45e3e1ecd1dd950e30dc",
              "snaktype": "value",
              "property": "P580",
              "datatype": "time",
              "datavalue": {
                "value": {
                  "time": "+00000002014-01-01T00:00:00Z",
                  "timezone": 0,
                  "before": 0,
                  "after": 0,
                  "precision": 11,
                  "calendarmodel": "http://www.wikidata.org/entity/Q1985727"
                },
                "type": "time"
              }
            }
          ]
        },
        "qualifiers-order": [
          "P580"
        ],
        "type": "statement",
        "rank": "preferred"
      },
      {
        "id": "q60$cad4e313-4b5e-e089-08b9-3b1c7998e762",
        "mainsnak": {
          "snaktype": "value",
          "property": "P6",
          "datatype": "wikibase-item",
          "datavalue": {
            "value": {
              "entity-type": "item",
              "id": "Q607",
              "numeric-id": 607
            },
            "type": "wikibase-entityid"
          }
        },
        "qualifiers": {
          "P580": [
            {
              "hash": "47c515b79f80e24e03375b327f2ac85184765d5b",
              "snaktype": "value",
              "property": "P580",
              "datatype": "time",
              "datavalue": {
                "value": {
                  "time": "+00000002002-01-01T00:00:00Z",
                  "timezone": 0,
                  "before": 0,
                  "after": 0,
                  "precision": 11,
                  "calendarmodel": "http://www.wikidata.org/entity/Q1985727"
                },
                "type": "time"
              }
            }
          ],
          "P582": [
            {
              "hash": "1f463f78538c49ef6adf3a9b18e211af7195240a",
              "snaktype": "value",
              "property": "P582",
              "datatype": "time",
              "datavalue": {
                "value": {
                  "time": "+00000002013-12-31T00:00:00Z",
                  "timezone": 0,
                  "before": 0,
                  "after": 0,
                  "precision": 11,
                  "calendarmodel": "http://www.wikidata.org/entity/Q1985727"
                },
                "type": "time"
              }
            }
          ]
        },
        "qualifiers-order": [
          "P580",
          "P582"
        ]
      }
    ],
    "P856": [
      {
        "id": "Q60$4e3e7a42-4ec4-b7c3-7570-b103eb2bc1ac",
        "mainsnak": {
          "snaktype": "value",
          "property": "P856",
          "datatype": "url",
          "datavalue": {
            "value": "http://nyc.gov/",
            "type": "string"
          }
        },
        "type": "statement",
        "rank": "normal"
      }
    ]
  },
  "sitelinks": {
    "afwiki": {
      "site": "afwiki",
      "title": "New York Stad",
      "badges": []
    },
    "dewiki": {
      "site": "dewiki",
      "title": "New York City",
      "badges": [
        "Q17437798"
      ]
    },
    "dewikinews": {
      "site": "dewikinews",
      "title": "Kategorie:New York",
      "badges": []
    },
    "elwiki": {
      "site": "elwiki",
      "title": "\u039d\u03ad\u03b1 \u03a5\u03cc\u03c1\u03ba\u03b7",
      "badges": []
    },
    "enwiki": {
      "site": "enwiki",
      "title": "New York City",
      "badges": []
    },
    "zhwikivoyage": {
      "site": "zhwikivoyage",
      "title": "\u7d10\u7d04",
      "badges": []
    },
    "zuwiki": {
      "site": "zuwiki",
      "title": "New York (idolobha)",
      "badges": []
    }
  }
}

这段代码的结果是:

        english_aliases =pydash.get(record, 'aliases.en')
        print(type(arabic_aliases))
        print(english_aliases)

[{'language': 'en', 'value': 'Kingdom of Belgium'}, {'language': 'en', 'value': 'BEL'}, {'language': 'en', 'value': 'be'}, {'language': 'en', 'value': '????????'}, {'language': 'en', 'value': 'BE'}]

【问题讨论】:

总是为空吗?我认为并非所有条目都有别名。你能否提供一条包含别名、但这段代码提取失败的数据记录示例?乍一看,代码没有问题。 我已在问题中添加了一条 JSON 记录示例。问题在于别名可能不止一个,它们存储在一个列表中,所以我不知道如何提取全部英文别名。 转储的原始 SPARQL 查询或筛选条件是什么? 原始代码在这里:akbaritabar.netlify.app/how_to_use_a_wikidata_dump 【参考方案1】:

答案是:

    english_aliases= set()
    if pydash.has(record, 'aliases.en'):
        for itm in pydash.get(record, 'aliases.en'):
            english_aliases.add(itm['value'])

【讨论】:

对,aliases.en 是一个列表(每个元素是带有 language 和 value 键的字典),而不像 labels.en 那样是单个字典,所以不能直接用 'aliases.en.value' 取值。

以上是关于使用 python 从 wikidata 转储中提取别名的主要内容,如果未能解决你的问题,请参考以下文章

如何过滤语言的Wikidata dump?

如何从 python 的 list 中提取想要的值

Wikidata和SparQL简介

wikidata研究和应用

使用 WikiData Sparql 高效查询抽象元素

在 Python 中轻松地将变量从/向命名空间/字典转储