Spaces:
Runtime error
Runtime error
| import random | |
| import biotite | |
| import numpy as np | |
| import torch.utils.data as data | |
| from typing import List | |
| from biotite.structure.residues import get_residues | |
| from biotite.sequence import ProteinSequence | |
| from biotite.structure.io import pdbx, pdb | |
| from biotite.structure import filter_backbone | |
| from biotite.structure import get_chains | |
def load_structure(fpath, chain=None):
    """
    Load the backbone atoms of the selected chains from a PDB or mmCIF file.

    Only the first model in the file is read. The structure is reduced to the
    atoms selected by ``filter_backbone`` before the chains are looked up, so
    a chain without any backbone atoms counts as absent.

    Args:
        fpath: filepath to either pdb or cif file (decided by the path suffix)
        chain: the chain id or list/tuple of chain ids to load; ``None``
            keeps every chain found in the file
    Returns:
        biotite.structure.AtomArray
    Raises:
        ValueError: if the path ends in neither ``cif`` nor ``pdb``, if no
            chain is left after backbone filtering, or if a requested chain
            id is not present in the file.
    """
    if fpath.endswith('cif'):
        with open(fpath) as fin:
            pdbxf = pdbx.PDBxFile.read(fin)
        structure = pdbx.get_structure(pdbxf, model=1)
    elif fpath.endswith('pdb'):
        with open(fpath) as fin:
            pdbf = pdb.PDBFile.read(fin)
        structure = pdb.get_structure(pdbf, model=1)
    else:
        # Without this branch `structure` would stay unbound and the function
        # would die later with an unrelated-looking UnboundLocalError.
        raise ValueError(
            f"Unsupported structure file {fpath!r}: "
            "expected a path ending in 'cif' or 'pdb'"
        )

    structure = structure[filter_backbone(structure)]
    all_chains = get_chains(structure)
    if len(all_chains) == 0:
        raise ValueError('No chains found in the input file.')

    if chain is None:
        chain_ids = all_chains
    elif isinstance(chain, (list, tuple)):
        chain_ids = list(chain)
    else:
        chain_ids = [chain]

    # Validate against a set so the membership test is exact equality per id.
    available = set(all_chains)
    for chain_id in chain_ids:
        if chain_id not in available:
            raise ValueError(f'Chain {chain_id} not found in input file')

    # Vectorised mask over the chain_id annotation instead of a per-atom loop.
    chain_filter = np.isin(structure.chain_id, chain_ids)
    return structure[chain_filter]
def get_atom_coords_residuewise(atoms: List[str], struct: biotite.structure.AtomArray):
    """
    Gather the coordinates of the named atoms for every residue of ``struct``.

    Example for atoms argument: ["N", "CA", "C"]

    For each residue the coordinates are returned in the order given by
    ``atoms``. An atom name that does not occur in a residue yields NaN
    coordinates for that slot; a residue containing the same requested atom
    name more than once raises RuntimeError.
    """
    def _pick_named_atoms(residue, axis=None):
        # `axis` is accepted only because the residue-wise driver may pass it.
        # Boolean table: one row per atom in the residue, one column per
        # requested atom name.
        matches = np.stack(
            [residue.atom_name == wanted for wanted in atoms], axis=1
        )
        hits_per_name = matches.sum(0)
        if np.any(hits_per_name > 1):
            raise RuntimeError("structure has multiple atoms with same name")
        # argmax gives the row of the single match (or 0 when there is none;
        # those slots are blanked out with NaN right below).
        first_match = matches.argmax(0)
        xyz = residue[first_match].coord
        xyz[hits_per_name == 0] = float("nan")
        return xyz

    return biotite.structure.apply_residue_wise(struct, struct, _pick_named_atoms)
def extract_coords_from_structure(structure: biotite.structure.AtomArray):
    """
    Extract the per-residue backbone coordinates of a structure.

    Args:
        structure: An instance of biotite AtomArray
    Returns:
        coords: an L x 3 x 3 array for N, CA, C coordinates (in that order),
            as produced by ``get_atom_coords_residuewise``; an atom missing
            from a residue is filled with NaN. Only the coordinates are
            returned, not the sequence.
    """
    # The residue sequence is deliberately not derived here: it is not part of
    # the return value, and converting residue names would only add a failure
    # mode for non-standard residues.
    return get_atom_coords_residuewise(["N", "CA", "C"], structure)
def extract_seq_from_pdb(pdb_file, chain=None):
    """
    Read a structure file and return its residue sequence in one-letter code.

    Args:
        pdb_file: filepath passed on to ``load_structure``
        chain: the chain id or list of chain ids to load (``None`` for all)
    Returns:
        - seq is the extracted sequence
    """
    atom_array = load_structure(pdb_file, chain)
    # get_residues yields (ids, names); only the three-letter names are needed.
    three_letter_names = get_residues(atom_array)[1]
    one_letter_codes = map(ProteinSequence.convert_letter_3to1, three_letter_names)
    return ''.join(one_letter_codes)