使用 python 从 wikidata 转储中提取别名
Posted
技术标签:
【中文标题】使用 python 从 wikidata 转储中提取别名【英文标题】:extract aliases from wikidata dump using python 【发布时间】:2022-01-10 16:38:10 【问题描述】:我正在尝试从 wikidata 转储中提取有关 wikidata 项目的某些字段,但我对某种语言的 aliases
字段有疑问,我的代码基于以下 URL how_to_use_a_wikidata_dump 中的代码,我进行了我的修改,但 aliases
字段返回空值:
# Walk every entity record in the Wikidata dump and pull out the fields of
# interest (labels, aliases, descriptions, selected claims, sitelinks).
#
# NOTE on the aliases bug: unlike `labels.<lang>` (a single {language, value}
# dict), `aliases.<lang>` is a *list* of such dicts, so a pydash path like
# 'aliases.ar.value' resolves to None. Collect the alias values explicitly.
for record in wikidata(args.dumpfile):
    print('i = ' + str(i) + ' item ' + record['id'] + ' started!' + '\n')
    item_id = pydash.get(record, 'id')
    # P31 = "instance of"; first statement's target item id
    item_type = pydash.get(record, 'claims.P31[0].mainsnak.datavalue.value.id')
    arabic_label = pydash.get(record, 'labels.ar.value')
    english_label = pydash.get(record, 'labels.en.value')
    # FIX: aliases are lists of {language, value} dicts — gather every value.
    # `or []` guards entities that have no aliases in that language.
    arabic_aliases = [alias['value']
                      for alias in pydash.get(record, 'aliases.ar') or []]
    english_aliases = [alias['value']
                       for alias in pydash.get(record, 'aliases.en') or []]
    arabic_desc = pydash.get(record, 'descriptions.ar.value')
    english_desc = pydash.get(record, 'descriptions.en.value')
    # P910 = "topic's main category"
    main_category = pydash.get(record, 'claims.P910[0].mainsnak.datavalue.value.id')
    arwiki = pydash.get(record, 'sitelinks.arwiki.title')
    arwikiquote = pydash.get(record, 'sitelinks.arwikiquote.title')
    enwiki = pydash.get(record, 'sitelinks.enwiki.title')
    # FIX: the path string was split across two source lines in the original
    # ('sitelinks.enwiki' / 'quote.title'), which is not valid Python.
    enwikiquote = pydash.get(record, 'sitelinks.enwikiquote.title')
wikidata 项目的 JSON 格式可以在这里找到: JSON Format
JSON 记录示例
"pageid": 186,
"ns": 0,
"title": "Q60",
"lastrevid": 199780882,
"modified": "2020-02-27T14:37:20Z",
"id": "Q60",
"type": "item",
"aliases":
"en": [
"language": "en",
"value": "NYC"
,
"language": "en",
"value": "New York"
],
"fr": [
"language": "fr",
"value": "New York City"
,
"language": "fr",
"value": "NYC"
],
"zh-mo": [
"language": "zh-mo",
"value": "\u7d10\u7d04\u5e02"
]
,
"labels":
"en":
"language": "en",
"value": "New York City"
,
"ar":
"language": "ar",
"value": "\u0645\u062f\u064a\u0646\u0629 \u0646\u064a\u0648 \u064a\u0648\u0631\u0643"
,
"fr":
"language": "fr",
"value": "New York City"
,
"my":
"language": "my",
"value": "\u1014\u101a\u1030\u1038\u101a\u1031\u102c\u1000\u103a\u1019\u103c\u102d\u102f\u1037"
,
"ps":
"language": "ps",
"value": "\u0646\u064a\u0648\u064a\u0627\u0631\u06a9"
,
"descriptions":
"en":
"language": "en",
"value": "largest city in New York and the United States of America"
,
"it":
"language": "it",
"value": "citt\u00e0 degli Stati Uniti d'America"
,
"pl":
"language": "pl",
"value": "miasto w Stanach Zjednoczonych"
,
"ro":
"language": "ro",
"value": "ora\u015ful cel mai mare din SUA"
,
"claims":
"P1151": [
"id": "Q60$6f832804-4c3f-6185-38bd-ca00b8517765",
"mainsnak":
"snaktype": "value",
"property": "P1151",
"datatype": "wikibase-item",
"datavalue":
"value":
"entity-type": "item",
"id": "Q6342720",
"numeric-id": 6342720
,
"type": "wikibase-entityid"
,
"type": "statement",
"rank": "normal"
],
"P625": [
"id": "q60$f00c56de-4bac-e259-b146-254897432868",
"mainsnak":
"snaktype": "value",
"property": "P625",
"datatype": "globe-coordinate",
"datavalue":
"value":
"latitude": 40.67,
"longitude": -73.94,
"altitude": null,
"precision": 0.00027777777777778,
"globe": "http://www.wikidata.org/entity/Q2"
,
"type": "globecoordinate"
,
"type": "statement",
"rank": "normal",
"references": [
"hash": "7eb64cf9621d34c54fd4bd040ed4b61a88c4a1a0",
"snaks":
"P143": [
"snaktype": "value",
"property": "P143",
"datatype": "wikibase-item",
"datavalue":
"value":
"entity-type": "item",
"id": "Q328",
"numeric-id": 328
,
"type": "wikibase-entityid"
]
,
"snaks-order": [
"P143"
]
]
],
"P150": [
"id": "Q60$bdddaa06-4e4b-f369-8954-2bb010aaa057",
"mainsnak":
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue":
"value":
"entity-type": "item",
"id": "Q11299",
"numeric-id": 11299
,
"type": "wikibase-entityid"
,
"type": "statement",
"rank": "normal"
,
"id": "Q60$0e484d5b-41a5-1594-7ae1-c3768c6206f6",
"mainsnak":
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue":
"value":
"entity-type": "item",
"id": "Q18419",
"numeric-id": 18419
,
"type": "wikibase-entityid"
,
"type": "statement",
"rank": "normal"
,
"id": "Q60$e5000a60-42fc-2aba-f16d-bade1d2e8a58",
"mainsnak":
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue":
"value":
"entity-type": "item",
"id": "Q18424",
"numeric-id": 18424
,
"type": "wikibase-entityid"
,
"type": "statement",
"rank": "normal"
,
"id": "Q60$4d90d6f4-4ab8-26bd-f2a5-4ac2a6eb48cd",
"mainsnak":
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue":
"value":
"entity-type": "item",
"id": "Q18426",
"numeric-id": 18426
,
"type": "wikibase-entityid"
,
"type": "statement",
"rank": "normal"
,
"id": "Q60$ede49e3c-44f6-75a3-eb74-6a89886e30c9",
"mainsnak":
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue":
"value":
"entity-type": "item",
"id": "Q18432",
"numeric-id": 18432
,
"type": "wikibase-entityid"
,
"type": "statement",
"rank": "normal"
],
"P6": [
"id": "Q60$5cc8fc79-4807-9800-dbea-fe9c20ab273b",
"mainsnak":
"snaktype": "value",
"property": "P6",
"datatype": "wikibase-item",
"datavalue":
"value":
"entity-type": "item",
"id": "Q4911497",
"numeric-id": 4911497
,
"type": "wikibase-entityid"
,
"qualifiers":
"P580": [
"hash": "c53f3ca845b789e543ed45e3e1ecd1dd950e30dc",
"snaktype": "value",
"property": "P580",
"datatype": "time",
"datavalue":
"value":
"time": "+00000002014-01-01T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
,
"type": "time"
]
,
"qualifiers-order": [
"P580"
],
"type": "statement",
"rank": "preferred"
,
"id": "q60$cad4e313-4b5e-e089-08b9-3b1c7998e762",
"mainsnak":
"snaktype": "value",
"property": "P6",
"datatype": "wikibase-item",
"datavalue":
"value":
"entity-type": "item",
"id": "Q607",
"numeric-id": 607
,
"type": "wikibase-entityid"
,
"qualifiers":
"P580": [
"hash": "47c515b79f80e24e03375b327f2ac85184765d5b",
"snaktype": "value",
"property": "P580",
"datatype": "time",
"datavalue":
"value":
"time": "+00000002002-01-01T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
,
"type": "time"
],
"P582": [
"hash": "1f463f78538c49ef6adf3a9b18e211af7195240a",
"snaktype": "value",
"property": "P582",
"datatype": "time",
"datavalue":
"value":
"time": "+00000002013-12-31T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
,
"type": "time"
]
,
"qualifiers-order": [
"P580",
"P582"
]
],
"P856": [
"id": "Q60$4e3e7a42-4ec4-b7c3-7570-b103eb2bc1ac",
"mainsnak":
"snaktype": "value",
"property": "P856",
"datatype": "url",
"datavalue":
"value": "http://nyc.gov/",
"type": "string"
,
"type": "statement",
"rank": "normal"
]
,
"sitelinks":
"afwiki":
"site": "afwiki",
"title": "New York Stad",
"badges": []
,
"dewiki":
"site": "dewiki",
"title": "New York City",
"badges": [
"Q17437798"
]
,
"dewikinews":
"site": "dewikinews",
"title": "Kategorie:New York",
"badges": []
,
"elwiki":
"site": "elwiki",
"title": "\u039d\u03ad\u03b1 \u03a5\u03cc\u03c1\u03ba\u03b7",
"badges": []
,
"enwiki":
"site": "enwiki",
"title": "New York City",
"badges": []
,
"zhwikivoyage":
"site": "zhwikivoyage",
"title": "\u7d10\u7d04",
"badges": []
,
"zuwiki":
"site": "zuwiki",
"title": "New York (idolobha)",
"badges": []
这段代码的结果是:
# Fetching 'aliases.en' (without a trailing '.value') returns the raw list of
# {language, value} dicts for that language, which is why the printed result
# below is a list rather than a single string.
english_aliases =pydash.get(record, 'aliases.en')
print(type(arabic_aliases))
print(english_aliases)
['language': 'en', 'value': '比利时王国', 'language': 'en', 'value': 'BEL', 'language': 'en', 'value': 'be', 'language': 'en', 'value': '????????', 'language': 'en', 'value': 'BE']
【问题讨论】:
它总是为空吗?我不希望所有条目都有别名。您是否有具有别名且此代码失败的数据记录示例?乍一看,还可以。 我在json记录中添加了一个例子,问题是别名可能不止一个,它存储在一个列表中,所以我不知道如何提取所有英文别名示例 转储的原始 SPARQL 查询或选择是什么? akbaritabar.netlify.app/how_to_use_a_wikidata_dump这是原码 【参考方案1】:答案是:
# Collect every English alias value. `aliases.en` is a list of
# {language, value} dicts, so keep each entry's 'value'; when the entity has
# no English aliases at all, the result is simply the empty set.
english_aliases = set()
if pydash.has(record, 'aliases.en'):
    english_aliases = {entry['value']
                       for entry in pydash.get(record, 'aliases.en')}
【讨论】:
对,aliases.en
是一个列表,不像labels.en
是一个单一的值。以上是关于使用 python 从 wikidata 转储中提取别名的主要内容,如果未能解决你的问题,请参考以下文章