批量PDB转一个jsonl
Posted by Mario cai
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了“批量PDB转一个jsonl”相关的知识,希望对你有一定的参考价值。
from Bio import PDB
import numpy as np
import pandas as pd
import scipy.stats as stats
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import xlwt
import re
import os
import time
from tqdm import *
from time import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from sklearn import manifold, datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import pandas as pd
import seaborn as sns
import jsonlines # 导入
# Three-letter residue name -> one-letter amino-acid code.
# Covers the 20 standard residues; 'GAP' maps to the '-' placeholder.
seq2 = {
    'ALA': 'A',
    'CYS': 'C',
    'ASP': 'D',
    'GLU': 'E',
    'PHE': 'F',
    'GLY': 'G',
    'HIS': 'H',
    'ILE': 'I',
    'LYS': 'K',
    'LEU': 'L',
    'MET': 'M',
    'ASN': 'N',
    'PRO': 'P',
    'GLN': 'Q',
    'ARG': 'R',
    'SER': 'S',
    'THR': 'T',
    'VAL': 'V',
    'TRP': 'W',
    'TYR': 'Y',
    'GAP': '-',
}
# Legacy accumulator; nothing in this script appends to it.
TOTAL = []
# Directory that holds the input PDB files.
file = r'D:\\ProteinMPNN-main\\TS500\\pdb'
abs_path = os.path.abspath(file)
# os.listdir() yields entries in arbitrary order; sorting makes the order of
# the records in the generated JSONL file reproducible between runs.
nnnn = sorted(os.listdir(abs_path))
# Parallel lists, one entry per structure that yielded backbone atoms:
# structure name, backbone-coordinate dict, one-letter sequence.
NAME = []
COORDS = []
SEQU = []
# Chain whose backbone is exported. It is also embedded in the JSON keys
# (e.g. "N_chain_V") and in the output file name ("V.jsonl").
chain_id = 'V'

# Backbone atoms, in the order they must appear on four consecutive records.
_BACKBONE_ATOMS = ('N', 'CA', 'C', 'O')


def _xyz(record):
    """Return [x, y, z] read from the fixed coordinate columns of a PDB record."""
    return [float(record[30:38]), float(record[38:46]), float(record[46:54])]


def _parse_backbone(lines, chain):
    """Collect backbone coordinates and the one-letter sequence of one chain.

    A residue is accepted only when its N, CA, C and O atoms appear on four
    consecutive ATOM/HETATM records, starting with an N record whose chain
    identifier equals ``chain``. Residues that do not follow this layout
    (for example, ones with interleaved alternate locations) are skipped.

    Args:
        lines: Lines of a PDB file, without trailing newlines.
        chain: Single-character chain identifier to extract.

    Returns:
        A ``(backbone, sequence)`` tuple. ``backbone`` maps each of 'N', 'CA',
        'C', 'O' to a list of [x, y, z] lists (one per accepted residue) and
        ``sequence`` is the matching one-letter string.

    Raises:
        KeyError: If an accepted residue has a name that is not in ``seq2``.
        ValueError: If a coordinate field cannot be parsed as a float.
    """
    backbone = {atom: [] for atom in _BACKBONE_ATOMS}
    letters = []
    # Stop three records before the end so the four-record window always fits.
    for start in range(len(lines) - 3):
        window = lines[start:start + 4]
        first = window[0]
        if first[21:22] != chain:
            continue
        # Only coordinate records carry atom names and x/y/z columns.
        if not all(rec.startswith(('ATOM', 'HETATM')) for rec in window):
            continue
        if tuple(rec[12:16].strip() for rec in window) != _BACKBONE_ATOMS:
            continue
        # Resolve the residue first so a lookup failure cannot leave the
        # coordinate lists longer than the sequence.
        letter = seq2[first[17:20].strip()]
        for atom, rec in zip(_BACKBONE_ATOMS, window):
            backbone[atom].append(_xyz(rec))
        letters.append(letter)
    return backbone, ''.join(letters)


for filename in tqdm(nnnn):
    pdb_path = os.path.join(abs_path, filename)
    if not os.path.isfile(pdb_path):
        # Sub-directories in the input folder cannot be parsed as structures.
        continue
    name = os.path.splitext(filename)[0]
    with open(pdb_path, 'r') as handle:
        lines = handle.read().splitlines()
    backbone, sequence = _parse_backbone(lines, chain_id)
    # Structures without any backbone atom on the requested chain are omitted.
    if backbone['N']:
        NAME.append(name)
        COORDS.append({
            f'{atom}_chain_{chain_id}': xyz for atom, xyz in backbone.items()
        })
        SEQU.append(sequence)

# Write one JSON object per structure into <chain_id>.jsonl.
out_dir = r"D:\\ProteinMPNN-main\\TS500\\jsonl"
os.makedirs(out_dir, exist_ok=True)
with jsonlines.open(os.path.join(out_dir, f"{chain_id}.jsonl"), 'w') as w:
    for name, coords, sequence in zip(NAME, COORDS, SEQU):
        w.write({
            f"coords_chain_{chain_id}": coords,
            "name": name,
            'num_of_chains': 1,
            'seq': sequence,
            f"seq_chain_{chain_id}": sequence,
        })
以上是关于“批量PDB转一个jsonl”的主要内容,如果未能解决你的问题,请参考以下文章。