Successfully imported evSeq


import os
import glob

import numpy as np
import pandas as pd
import holoviews as hv
import ninetysix as ns

import evSeq.data_visualization as seqviz


# User input to relevant evSeqOutput folder
evseq_output_dir = '../data/example_notebook/SSM/'


# Read in the file
seq_file = 'AminoAcids_Coupled_Max.csv'
seq_files = os.path.join(evseq_output_dir, seq_file)
seq_df = pd.read_csv(seq_files)
seq_df


# Create a dictionary of the native amino acids for each position
parent_dict = {
    # position: AA
    105: 'E',
    118: 'A',
    162: 'L',
    166: 'I',
    184: 'F',
    228: 'S',
    292: 'S',
    301: 'Y',
}

def set_parent(combo):
    """Parses the 'VariantCombo' string when it contains a ? for
    the native amino acid, given a parent_dict mapping; to be used
    as `df['VariantCombo'].apply(set_parent)` and reassigned
    back to the VariantCombo column.
    """
    # Get the mutations as a list
    muts = combo.split('_')
    
    # For each mutant
    for i, mut in enumerate(muts):
        
        # For the ? amino acids
        if mut[0] == '?':
            
            # Get the position given ?[postition]X
            position = int(mut[1:-1])

            # Get the parent amino acid
            parent_AA = parent_dict[position]

            # Replacethe '?' with this amino acid and replace in the list
            muts[i] = mut.replace('?', parent_AA)
    
    # Return in the same '_' joined format
    return '_'.join(muts)

# Apply the function
seq_df['VariantCombo'] = seq_df['VariantCombo'].apply(set_parent)

# Check
seq_df


single_site_sequencing = seqviz.generate_platemaps(seq_df)
single_site_sequencing


# User input to functional data
func_file = '../data/example_notebook/SSM/201009_indole_rate_data.csv'


# Read in and check column names
func_df = pd.read_csv(func_file, index_col=0).rename(columns={'Plate_name': 'Plate'})
func_df


# Remove 184-D10, which was a parent control but was dead
def set_negative(df_row):
    if (df_row['Position'] == 184) & (df_row['Well'] == 'D10'):
        df_row['Type'] = 'Negative'
        
    return df_row

# Apply the function
func_df = func_df.apply(set_negative, axis=1)

# Check
func_df[(func_df['Position'] == 184) & (func_df['Well'] == 'D10')]


normed = ns.normalize(
    func_df,
    value='Rate (µM/s)',
    to='Type=Parent',
    zero='Type=Negative',
    groupby='Plate',
    prefix='(normalized) '
)

# Examine the 1.2-fold hits from each plate
normed[normed['(normalized) Rate (µM/s)'] > 1.2].groupby('Position')['Well'].count()

Position
105     1
118     2
162     3
184    36
228     8
Name: Well, dtype: int64


# Now the 2-fold
normed[normed['(normalized) Rate (µM/s)'] > 2].groupby('Position')['Well'].count()

Position
162     1
184    11
Name: Well, dtype: int64


# Function data is combined with the AminoAcids_Decouple_Max.csv file
seq_func_data = seqviz.combine_seq_func_data(func_df, evseq_output_dir)
seq_func_data


seqviz.plot_SSM_activities?


simple_seqfunc = seqviz.plot_SSM_activities(
    seq_func_data,
    value='Rate (µM/s)',
)

simple_seqfunc


labeled_seqfunc = seqviz.plot_SSM_activities(
    seq_func_data, 
    value='Rate (µM/s)', 
    known='Type', 
    variant='Variant',
)

labeled_seqfunc


labeled_std_seqfunc = seqviz.plot_SSM_activities(
    seq_func_data, 
    value='Rate (µM/s)', 
    known='Type', 
    variant='Variant', 
    standard='Parent',
)

labeled_std_seqfunc


# Load in the data and view
multi_seq_folder = '../data/example_notebook/Multisite_OutputCounts/'
multi_seq_files = os.path.join(multi_seq_folder, seq_file)
multi_seq_df = pd.read_csv(multi_seq_files)
multi_seq_df


parent_dict = {
    28: 'S',
    31: 'M',
    52: 'Q',
    56: 'L',
}

multi_seq_df['VariantCombo'] = multi_seq_df['VariantCombo'].apply(set_parent)

seqviz.generate_platemaps(multi_seq_df)


from holoviews import opts


seq_df = pd.read_csv(seq_files)

figure2b = seqviz.generate_platemaps(seq_df).opts(
    xticks=0,
    yticks=0,
    xaxis=None,
    yaxis=None,
    labelled=[],
    show_legend=False,
    title='',
    toolbar=None,
).opts({'HeatMap': dict(colorbar=False)})

# These are overlaid in Figure 2b
figure2b


# Add a normalized column to the df for plotting the seq/func data for single sites
orig_SSM_data = ns.Plate(seq_func_data, value_name='Rate (µM/s)')
orig_SSM_data = orig_SSM_data.normalize(
    value='Rate (µM/s)',
    to='Type=Parent',
    update_value=True,
    groupby='Plate',
    prefix='Normalized ',
)

# Convert to DataFrame and rename column
orig_SSM_data = orig_SSM_data.df.rename(columns={'Normalized Rate (µM/s)': 'Normalized Rate'})

# Clean the data for plotting platemaps of all data; copy first
cleaned_seq_func_data = seq_func_data.copy()

# Don't need Flags
cleaned_seq_func_data = cleaned_seq_func_data.drop(columns=['Flags'])

# Drop na columns
cleaned_seq_func_data = cleaned_seq_func_data.dropna()

# Take only wells that match a certain critera
cleaned_seq_func_data = cleaned_seq_func_data.loc[(
    (cleaned_seq_func_data['AlignmentFrequency'] > 0.8) &
    (cleaned_seq_func_data['WellSeqDepth'] > 10) &
    (cleaned_seq_func_data['MutCount'] <= 1)
)]
cleaned_seq_func_data['Position'] = cleaned_seq_func_data['Position'].astype('str', copy=False)

# Create a plate object with ninetysix and add normalized columns
SSM_data = ns.Plate(cleaned_seq_func_data, value_name='Rate (µM/s)')
SSM_data = SSM_data.normalize(
    value='Rate (µM/s)',
    to='Type=Parent',
    update_value=True,
    groupby='Plate',
    prefix='Normalized ',
)

# Convert to DataFrame and rename column
SSM_data = SSM_data.df.rename(columns={'Normalized Rate (µM/s)': 'Normalized Rate'})

# Show the data
SSM_data


import colorcet as cc
count_cmap = cc.coolwarm[int(len(cc.coolwarm)/2)::1]

# set height and width for the two SSM heatmaps
n_rows=8
n_cols=20
height = 250
width = height * n_cols // n_rows

# Get counts of non-control wells (# of actual sequenced unknowns)
SSM_counts = SSM_data.loc[SSM_data['Type']=='Variant'].groupby(by=['Position','AA']).count()

# Rename the count column
SSM_counts = SSM_counts.rename(columns={'Normalized Rate': 'Counts'})

figure3b = hv.HeatMap(
    data=SSM_counts,
    kdims=['AA', 'Position'], 
    vdims=hv.Dimension('Counts', range=(0,14)),
).opts(
    height=height,
    width=width,
    cmap=count_cmap,
    xrotation=45,
    colorbar=True,
    colorbar_opts=dict(
        title='Counts',
    ),
    xlabel='Residue',
    invert_yaxis = True,
    tools=['hover']
)

figure3b


color_levels=ns.viz._center_colormap(SSM_data['Normalized Rate'], 1)

figure3c = hv.HeatMap(
    data=SSM_data.groupby(by=['Position','AA']).mean(), # keeps parent in the calculations
    kdims=['AA', 'Position'], 
    vdims=['Normalized Rate'],
).opts(
    height=height, 
    width=width,
    cmap='coolwarm',
    color_levels=color_levels,
    colorbar=True,
    colorbar_opts=dict(
        title='Normalized Rate',
    ),
    xrotation=45,
    xlabel='Residue',
    invert_yaxis=True,
    tools=['hover'],
)

figure3c


# Just a look at position 105, full SSM activity plot
lib105X_plot = seqviz.plot_SSM_activities(
    orig_SSM_data.loc[(orig_SSM_data['Plate']=='Lib1_105X')],
    value='Normalized Rate',
    known='Type',
    variant='Variant',
    standard='Parent',
)

# Just pull the histogram from the layout (index=1)
figure3d = lib105X_plot[lib105X_plot.keys()[1]]
figure3d = figure3d.opts(color='Counts', cmap=count_cmap, tools=['hover'])

figure3d


# Activity plot is index=0
figure3e = lib105X_plot[lib105X_plot.keys()[0]].opts(
    opts.Bars(
        xlabel='Residue',
        ylabel='Normalized Rate',
    ),
)

figure3e


# from bokeh.io import export_svg
# svg_path=''

# # figure 2b
# for key in figure2b.keys():
    
#     plot=hv.render(figure2b[key].opts(
#         opts.HeatMap(
#             colorbar=False,    
#         ),
#         opts.Points(
#             line_join='round'
#         )
#     ))

#     plot.outline_line_color='black'
#     plot.outline_line_width=5
#     plot.outline_line_alpha=1
#     plot.output_backend = 'svg'
#     plot.min_border=0
#     filename=svg_path+'figure2b_'+str(key)+'.svg'
#     export_svg(plot, filename=filename)


# figure 3b
# plot=hv.render(figure3b)
# plot.output_backend = 'svg'
# filename=svg_path+'figure3b'+'.svg'
# export_svg(plot, filename=filename)

# # figure 3c
# plot=hv.render(figure3c)
# plot.output_backend = 'svg'
# filename=svg_path+'figure3c'+'.svg'
# export_svg(plot, filename=filename)

# # figure 3d
# plot=hv.render(figure3d)
# plot.output_backend = 'svg'
# filename=svg_path+'figure3d'+'.svg'
# export_svg(plot, filename=filename)

# # figure 3e
# plot=hv.render(figure3e)
# plot.output_backend = 'svg'
# filename=svg_path+'figure3e'+'.svg'
# export_svg(plot, filename=filename)


def protabank_exchange(evSeq_func_data=None):
    """Converts data from combine_seq_func_data to an acceptable
    format for submission to protabank
    """
    return NotImplemented

protabank_exchange()

NotImplemented


seqviz.check_distributions(multi_seq_df)


seqviz.check_distributions(seq_df)

	IndexPlate	Plate	Well	VariantCombo	SimpleCombo	VariantsFound	AlignmentFrequency	WellSeqDepth	VariantSequence	Flags
0	DI01	Lib1_105X	A01	?105R	R	1	0.986842	76	IIARTGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKL...	NaN
1	DI01	Lib1_105X	A02	?105D	D	1	1.000000	43	IIADTGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKL...	NaN
2	DI01	Lib1_105X	A03	?105W	W	1	0.981818	55	IIAWTGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKL...	NaN
3	DI01	Lib1_105X	A04	?105E	E	1	1.000000	57	IIAETGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKL...	NaN
4	DI01	Lib1_105X	A05	?105P	P	1	0.970588	68	IIAPTGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKL...	NaN
...	...	...	...	...	...	...	...	...	...	...
757	DI08	Lib8_301X	H08	?301E	E	1	0.883459	266	CVSGGSNAAGIFYPFIDSGVKLIGVEAGGEGLETGKHAASLLKGKI...	NaN
758	DI08	Lib8_301X	H09	?301P	P	1	0.990291	309	CVSGGSNAAGIFYPFIDSGVKLIGVEAGGEGLETGKHAASLLKGKI...	NaN
759	DI08	Lib8_301X	H10	#DEAD#	#DEAD#	0	0.000000	0	#DEAD#	Too few usable reads
760	DI08	Lib8_301X	H11	G249D_?301L	DL	2	0.995370	216	CVSGGSNAAGIFYPFIDSGVKLIDVEAGGEGLETGKHAASLLKGKI...	NaN
761	DI08	Lib8_301X	H12	?301W	W	1	0.984456	193	CVSGGSNAAGIFYPFIDSGVKLIGVEAGGEGLETGKHAASLLKGKI...	NaN

	IndexPlate	Plate	Well	VariantCombo	SimpleCombo	VariantsFound	AlignmentFrequency	WellSeqDepth	VariantSequence	Flags
0	DI01	Lib1_105X	A01	E105R	R	1	0.986842	76	IIARTGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKL...	NaN
1	DI01	Lib1_105X	A02	E105D	D	1	1.000000	43	IIADTGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKL...	NaN
2	DI01	Lib1_105X	A03	E105W	W	1	0.981818	55	IIAWTGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKL...	NaN
3	DI01	Lib1_105X	A04	E105E	E	1	1.000000	57	IIAETGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKL...	NaN
4	DI01	Lib1_105X	A05	E105P	P	1	0.970588	68	IIAPTGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKL...	NaN
...	...	...	...	...	...	...	...	...	...	...
757	DI08	Lib8_301X	H08	Y301E	E	1	0.883459	266	CVSGGSNAAGIFYPFIDSGVKLIGVEAGGEGLETGKHAASLLKGKI...	NaN
758	DI08	Lib8_301X	H09	Y301P	P	1	0.990291	309	CVSGGSNAAGIFYPFIDSGVKLIGVEAGGEGLETGKHAASLLKGKI...	NaN
759	DI08	Lib8_301X	H10	#DEAD#	#DEAD#	0	0.000000	0	#DEAD#	Too few usable reads
760	DI08	Lib8_301X	H11	G249D_Y301L	DL	2	0.995370	216	CVSGGSNAAGIFYPFIDSGVKLIDVEAGGEGLETGKHAASLLKGKI...	NaN
761	DI08	Lib8_301X	H12	Y301W	W	1	0.984456	193	CVSGGSNAAGIFYPFIDSGVKLIGVEAGGEGLETGKHAASLLKGKI...	NaN

	Position	Well	Rate (µM/s)	Type	Column	Row	Plate
0	105	A01	0.012341	Variant	1	A	Lib1_105X
1	105	A02	0.419586	Variant	2	A	Lib1_105X
2	105	A03	0.035158	Variant	3	A	Lib1_105X
3	105	A04	0.327427	Variant	4	A	Lib1_105X
4	105	A05	0.029853	Variant	5	A	Lib1_105X
...	...	...	...	...	...	...	...
763	301	H08	0.117740	Variant	8	H	Lib8_301X
764	301	H09	0.034030	Variant	9	H	Lib8_301X
765	301	H10	0.010226	Negative	10	H	Lib8_301X
766	301	H11	-0.011769	Variant	11	H	Lib8_301X
767	301	H12	-0.024868	Variant	12	H	Lib8_301X

	Position	Well	Rate (µM/s)	Type	Column	Row	Plate	IndexPlate	AA	AlignmentFrequency	WellSeqDepth	Flags	MutCount
0	105	A01	0.012341	Variant	1	A	Lib1_105X	DI01	R	0.987013	77.0	NaN	1.0
1	105	A02	0.419586	Variant	2	A	Lib1_105X	DI01	D	1.000000	43.0	NaN	1.0
2	105	A03	0.035158	Variant	3	A	Lib1_105X	DI01	W	0.981818	55.0	NaN	1.0
3	105	A04	0.327427	Variant	4	A	Lib1_105X	DI01	E	1.000000	57.0	NaN	1.0
4	105	A05	0.029853	Variant	5	A	Lib1_105X	DI01	P	0.971014	69.0	NaN	1.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...
763	301	H08	0.117740	Variant	8	H	Lib8_301X	DI08	E	0.882784	273.0	NaN	1.0
764	301	H09	0.034030	Variant	9	H	Lib8_301X	DI08	P	0.990536	317.0	NaN	1.0
765	301	H10	0.010226	Negative	10	H	Lib8_301X	NaN	NaN	NaN	NaN	NaN	NaN
766	301	H11	-0.011769	Variant	11	H	Lib8_301X	DI08	L	0.995516	223.0	NaN	2.0
767	301	H12	-0.024868	Variant	12	H	Lib8_301X	DI08	W	0.984925	199.0	NaN	1.0

	IndexPlate	Plate	Well	VariantCombo	SimpleCombo	VariantsFound	AlignmentFrequency	WellSeqDepth	VariantSequence	Flags
0	DI01	Plate01	A01	?28L_?31M_?52W_?56H	LMWH	4	0.951258	636	ILATMGRLLFERYPETRSLFELPERWIHKHASA	NaN
1	DI01	Plate01	A02	?28M_?31F_?52S_?56V	MFSV	4	0.983478	1150	IMATFGRLLFERYPETRSLFELPERSIHKVASA	NaN
2	DI01	Plate01	A03	#DEAD#	#DEAD#	0	0.000000	9	#DEAD#	Too few usable reads
3	DI01	Plate01	A04	?28Q_?31G_?52E_?56D	QGED	4	0.977408	1682	IQATGGRLLFERYPETRSLFELPEREIHKDASA	NaN
4	DI01	Plate01	A05	?28I_?31I_?52S_?56L	IISL	4	0.978471	929	IIATIGRLLFERYPETRSLFELPERSIHKLASA	NaN
...	...	...	...	...	...	...	...	...	...	...
475	DI05	Plate05	H08	?28V_?31M_?52W_?56L	VMWL	4	0.990984	1220	IVATMGRLLFERYPETRSLFELPERWIHKLASA	NaN
476	DI05	Plate05	H09	?28H_?31E_?52T_?56L	HETL	4	0.983586	792	IHATEGRLLFERYPETRSLFELPERTIHKLASA	NaN
477	DI05	Plate05	H10	?28G_?31M_?52T_?56H	GMTH	4	0.957166	607	IGATMGRLLFERYPETRSLFELPERTIHKHASA	NaN
478	DI05	Plate05	H11	?28E_?31R_?52L_?56A	ERLA	4	0.986164	795	IEATRGRLLFERYPETRSLFELPERLIHKAASA	NaN
479	DI05	Plate05	H12	?28C_?31Q_?52R_?56F	CQRF	4	0.988535	785	ICATQGRLLFERYPETRSLFELPERRIHKFASA	NaN

evSeq

Computational tools for extremely low-cost, massively parallel amplicon-based sequencing of every variant in protein mutant libraries.

Using `evSeq` data¶

Importing and viewing `evSeq` data¶

Interpreting your heatmap:¶

Pairing sequence to function¶

Load in the data you have to pair with your sequencing data¶

Parent control D10 in the 184 library had zero activity¶

Examine the number of "hits", using the thresholds of 1.2-fold and 2-fold improvements.¶

Combine sequence and fitness data for use in exploring the data visually¶

Analyzing single-site-saturation libraries¶

You can do sequence-function plotting a number of different ways.¶

1. The most general use of this function.¶

2. Use known controls to understand the effect of your mutations¶

3. Set the standard/parent to center the colormap¶

`evSeq` for multisite libraries¶

Generating our visualizations¶

Figure 2b¶

Preparing data for figure 3¶

Figure 3b¶

Figure 3c¶

Figure 3d¶

Figure 3e¶

Rendering figures as SVGs¶

Submitting to Protαβank and other databases¶

Miscellaneous visualizations¶

Look at the distribution of read counts/wellseqdepths¶

Using evSeq data¶

Importing and viewing evSeq data¶

Interpreting your heatmap:¶

Pairing sequence to function¶

Load in the data you have to pair with your sequencing data¶

Parent control D10 in the 184 library had zero activity¶

Examine the number of "hits", using the thresholds of 1.2-fold and 2-fold improvements.¶

Combine sequence and fitness data for use in exploring the data visually¶

Analyzing single-site-saturation libraries¶

You can do sequence-function plotting a number of different ways.¶

1. The most general use of this function.¶

2. Use known controls to understand the effect of your mutations¶

3. Set the standard/parent to center the colormap¶

evSeq for multisite libraries¶

Generating our visualizations¶

Figure 2b¶

Preparing data for figure 3¶

Figure 3b¶

Figure 3c¶

Figure 3d¶

Figure 3e¶

Rendering figures as SVGs¶

Submitting to Protαβank and other databases¶

Miscellaneous visualizations¶

Look at the distribution of read counts/wellseqdepths¶

Using `evSeq` data¶

Importing and viewing `evSeq` data¶

`evSeq` for multisite libraries¶