Useful Python snippets
# These are meant to work in both Python 2 and 3, except where noted.
# See my useful_pandas_snippets.py for those related to dataframes (such as pickling/`df.to_pickle(save_as)`)
# https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4
# also see https://gist.github.com/fomightez/324b7446dc08e56c83fa2d7af2b89a33 for examples of my
# frequently used Python functions and slight variations for more expanded, modular structures.
#argparse
# good snippet collection at https://mkaz.tech/code/python-argparse-cookbook/
# positional
parser.add_argument("input", help="Name of the file that was \
    generated by the other program \
    when run with your transcriptome of interest.", metavar="INPUT_FILE")
# with optional positional
parser.add_argument("input", nargs='?', help="**OPTIONAL** Name of the file \
    generated by the other program \
    when run with your transcriptome of interest. Usually, this is \
    '"+input_file_name_default+"' &\
    if no input file name is provided then this will be used by \
    default.", default=input_file_name_default, metavar="INPUT_FILE")
# Note: see https://stackoverflow.com/questions/18862836/how-to-open-file-using-argparse#comment35222484_18863004
# for why the `argparse.FileType` approach isn't used here.
# See
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments
# and
# https://docs.python.org/2/library/argparse.html#nargs for use of `nargs='?'`
# to make input and output file names optional. Note that the square brackets
# shown in the usage output signify optional, according to
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments#comment40460395_4480202
# , but because this is placed under the positionals I added clarifying text to the help
# description.
# IF MODIFYING THIS SCRIPT FOR USE ELSEWHERE AND YOU DON'T NEED/WANT THE OUTPUT
# FILE TO BE OPTIONAL, remove `nargs` (& default?) BUT KEEP THE PARTS WHERE NOT
# USING `argparse.FileType` AND INSTEAD USING `with open`, AS THAT IS CONSIDERED
# MORE PYTHONIC (a minimal sketch of that favored pattern follows).
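# A minimal sketch of that favored pattern: take the file name as a plain string
# from argparse and open it yourself with `with open`. (The argument name and
# help text here are illustrative placeholders, not from a specific script.)
import argparse
import sys
parser = argparse.ArgumentParser()
parser.add_argument("input", help="Name of the data file to read.",
    metavar="INPUT_FILE")
args = parser.parse_args()
with open(args.input, 'r') as input_handle:
    for line in input_handle:
        sys.stderr.write(line) # placeholder processing; a real script would parse here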
# With a list where you won't know the exact size, i.e., it could be one item
# (Example from `plot_expression_across_chromosomes.py`; see `donut_plot_with_subgroups_from_dataframe.py` for another)
parser.add_argument('-chr', '--chrs', action='store', type=str,
    help="use this flag to limit plotting of the data to particular \
    chromosomes or scaffolds you specify immediately following this flag. \
    Separate the chromosome or scaffold identifiers by commas, without spaces. \
    Example use in a command is `--chrs I,IV,XVI`. \
    Default when this optional flag is not called is to plot the data for all \
    chromosomes or scaffolds. ") # based on
# https://stackoverflow.com/questions/15753701/argparse-option-for-passing-a-list-as-option
# ; specifically, https://stackoverflow.com/a/24866869/8508004
#...
if args.chrs:
    if "," in args.chrs:
        limit_to_chrs = args.chrs.split(',')
    else:
        # means only one item
        limit_to_chrs = [args.chrs] # has to be a list for passing to Pandas `isin()`
# with flag parameters
parser.add_argument('-sa', '--save_as', action='store', type=str,
    default=generate_output_file_name(previous_pickled_df), help="Use \
    this option to supply a name of \
    the file to save for storing the produced dataframe. If none is provided, \
    the name '"+generate_output_file_name(previous_pickled_df)+"' will be \
    used. To force nothing to be saved, enter \
    `-sa no_output` without quotes as the output file (ATYPICAL).")
# with choices
# see https://stackoverflow.com/a/35970231/8508004
# or https://stackoverflow.com/a/15301183/8508004
# or https://stackoverflow.com/questions/40324356/python-argparse-choices-with-a-default-choice
parser.add_argument("-og", "--output_grouping", type=str,
    default="single", choices=["single", "separate", "both"],
    help="OPTIONAL: Specify grouping of output with this option. Choose \
    `-og single` for one table or dataframe for all categories. Or choose \
    `-og separate` for a separate table or dataframe for each category. \
    Or specify `-og both` to output both types. \
    If this option is not specified, {} will be used.".format("single"))
# Now-DISFAVORED approach for reading in files. (Disfavored because use of `with` is
# favored and is not compatible with this. So only use it when really in a hurry to
# scrape something together; see https://stackoverflow.com/questions/18862836/how-to-open-file-using-argparse#comment35222484_18863004)
parser.add_argument("MTA", help="Name of file containing data. REQUIRED.", type=argparse.FileType('r'), metavar="FILE")
# I would also like to trigger help to display if no arguments are provided, because at least one input file is needed
if len(sys.argv) == 1: # from http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu
    parser.print_help()
    sys.exit(1)
args = parser.parse_args()
the_file = args.MTA
# open input file and start reading
sys.stderr.write("\nReading input file...")
#input_file_stream = open(the_file, "r") # Don't need a separate open when using `type=argparse.FileType`. It sets everything up automatically and you will actually cause errors if you try to open when already open.
for line in the_file:
    pass
# assert
assert len(numbers_given_for_start_n_end) == len(aln_ids_in_out_order), (
    "The user-supplied list must be equal in length to the list of data "
    "previously supplied as 'aln_ids_in_out_order'.")
# Verify that values in a dictionary are equal in length using the `count` method of Python lists
lengths_of_sequences = [len(v) for v in sequence_dict.values()]
assert lengths_of_sequences.count(
    lengths_of_sequences[0]) == len(lengths_of_sequences), ("The length "
    "of all parsed sequences should be the same.")
# that assertion test involves Ivo van der Wijk's solution from
# https://stackoverflow.com/a/3844948/8508004
# Getting hex color codes and RGB values out of Seaborn color palettes / show colors
# based on https://stackoverflow.com/questions/38249454/extract-rgb-or-6-digit-code-from-seaborn-palette
# WORKS IN A JUPYTER NOTEBOOK CELL
import seaborn as sns
#num_shades = 8
#sns.palplot(sns.cubehelix_palette(num_shades))
pal = sns.color_palette("RdBu_r")
print(pal.as_hex())
print(pal)
sns.palplot(sns.color_palette("RdBu_r"))
sns.palplot(sns.diverging_palette(5, 250))
# Check if a string can be cast to a number, and cast it (example from `donut_plot_with_subgroups_from_dataframe.py`)
def is_number(s):
    '''
    check if a string can be cast to a float or numeric (integer).
    Takes a string.
    Returns True or False
    fixed from https://www.pythoncentral.io/how-to-check-if-a-string-is-a-number-in-python-including-unicode/
    later noted similar code is at https://code-maven.com/slides/python-programming/is-number
    '''
    try:
        float(s)
        return True
    except ValueError:
        pass
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False
def cast_to_number(s):
    '''
    Cast a string to a float or integer.
    Tries casting to float first, and if that works it then tries casting the
    string to an integer. (I thought I saw a suggestion of that order somewhere
    when searching for what I used as the `is_number()` check but cannot find
    the source right now.)
    Returns a float, int, or if it fails, False. (Where this is used, it
    shouldn't ever trigger returning `False` because it was checked first that
    all could be converted.)
    based on fixed code from https://www.pythoncentral.io/how-to-check-if-a-string-is-a-number-in-python-including-unicode/
    '''
    try:
        number = float(s)
        try:
            number = int(s)
            return number
        except ValueError:
            pass
        return number
    except ValueError:
        pass
    try:
        import unicodedata
        num = unicodedata.numeric(s)
        return num
    except (TypeError, ValueError):
        pass
    return False
# count frequency
def count_frequency_in_list(l):
    '''
    takes a list and returns a dictionary of the counts of the items in that
    list
    based on https://stackoverflow.com/a/2162045/8508004
    '''
    import collections
    return collections.Counter(l)
clusters_nofeat = clusters_nofeat_df["Acluster(+)"].tolist() + clusters_nofeat_df["Acluster(-)"].tolist()
# split up those that contain more than one, i.e., separated by commas, and strip whitespace;
# expanded list comprehension from https://stackoverflow.com/a/27886807/8508004
clusters_nofeat = [x.strip() for xs in clusters_nofeat for x in xs.split(',')]
# remove the blank/empty ones (this is how you remove empty strings from a list); it can also be used to remove blank lines from splits on `"\n"`
clusters_nofeat = [x for x in clusters_nofeat if x]
# remove duplicates
clusters_nofeat = set(clusters_nofeat)
# remove the dash and everything after it to count frequency
clusters_nofeat = [x.split("-")[0] for x in clusters_nofeat]
# count frequency
counts = count_frequency_in_list(clusters_nofeat)
clusters_nofeat
# next line based on https://stackoverflow.com/questions/2161752/how-to-count-the-frequency-of-the-elements-in-a-list#comment46593992_2162045
# ; otherwise you get a `Counter()` type
dict(counts)
from collections import Counter
most_common, num_most_common = Counter(some_list).most_common(1)[0] # based on
# https://stackoverflow.com/a/6987358/8508004
# unique items in a list that occur more than two times (or some other number you specify)
the_count = Counter(some_list)
print([k for k, v in the_count.items() if v > 2]) # based
# on https://stackoverflow.com/a/26773120/8508004 and
# https://stackoverflow.com/a/30418498/8508004 , to work in 2.7 and 3
# incrementing a count by instance while iterating on a list (could be applied to a dataframe column too)
# based on https://stackoverflow.com/a/1692428/8508004
from collections import defaultdict
l = ['apple','cherry','apple','cherry','cherry','pear']
d = defaultdict(int)
for x in l:
    d[x] += 1
    print("That is {} #{}".format(x, d[x]))
print("final counts: {}".format(dict(d)))
# count the number of non-overlapping occurrences of a substring
num_sequences = file_listing_text.count('.fa')
# combining counting and regular expressions (regex)
# count frequency of blocks of Ns in a string (presumably sequence)
import re
import collections
from collections import defaultdict
t = "NaaNNNhcTCaaNANANDANNNNNNAANNANNANNNNNNNNANANANNANNNNNN"
matches = []
len_match_dict = defaultdict(int)
min_number_Ns_in_row_to_collect = 1
pattern_obj = re.compile("N{{{},}}".format(min_number_Ns_in_row_to_collect), re.I) # adapted from
# code worked out in `collapse_large_unknown_blocks_in_DNA_sequence.py`, which relied heavily on
# https://stackoverflow.com/a/250306/8508004
for m in pattern_obj.finditer(t):
    len_match_dict[len(m.group())] += 1
    matches.append(m.group())
print(len_match_dict)
print(collections.Counter(matches))
#copy a list
# Important if you'll be iterating on it and modifying it at the same time. Modify the copy or iterate on the copy, but not both.
# Also useful if you want to start a new list with the contents of an old one. If you just do `new_list = old_list`, that just copies
# the reference to the list, see https://stackoverflow.com/a/2612815/8508004 . That probably won't be what you want.
new_list = old_list.copy()
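# Quick demo of the aliasing pitfall vs. a true copy (illustrative values; note
# `list.copy()` is Python 3 only, while a slice copy works in Python 2 as well):
old_list = [1, 2, 3]
alias = old_list        # both names point at the SAME list object
alias.append(4)
print(old_list)         # [1, 2, 3, 4] -- changed through the alias
true_copy = old_list[:] # a slice makes an independent shallow copy
true_copy.append(5)
print(old_list)         # still [1, 2, 3, 4] -- unchanged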
# Split a list into chunks, also collecting any remainder group
# from https://stackoverflow.com/a/312464/8508004
def chunks(a_list, n):
    """Yield successive n-sized chunks from list."""
    for i in range(0, len(a_list), n):
        yield a_list[i:i+n]
print(list(chunks(range(10, 75), 10)))
# there is another approach with a list comprehension for a string in this gist that is based
# on https://stackoverflow.com/a/13673133/8508004 and looks similar to the list comprehension one for
# lists at https://stackoverflow.com/a/32467096/8508004
# RELATED: chunking with a step/stride of a certain amount to produce overlap. For example, to reproduce the process
# described in this figure legend: "The data were calculated from 150-base pair windows with 50 base pairs of overlap between adjacent windows and plotted ..."
a = "adhaskjhdjkashdajkshdkjashdaslalllslslslsaashdahs"
chunk_size = 10
step_size = 5
def chunk_string_with_different_step(string, chunk_size, step_size):
    """Return a list of n-sized chunks from string of letters."""
    return [string[i:i+chunk_size] for i in range(0, len(string), step_size)]
chunk_string_with_different_step(a, chunk_size, step_size)
'''
#RESULT:
['adhaskjhdj',
 'kjhdjkashd',
 'kashdajksh',
 'ajkshdkjas',
 'dkjashdasl',
 'hdaslallls',
 'alllslslsl',
 'lslslsaash',
 'saashdahs',
 'dahs']
'''
# AS GENERATOR:
b = "adhaskjhdjkashdajkshdkjashdaslalllslslslsaashdahs"
def gen_chunk_string_with_different_step(a_list, chunk_size, step_amount):
    """Yield successive n-sized chunks from list, stepping/striding by step_amount."""
    for i in range(0, len(a_list), step_amount):
        yield a_list[i:i+chunk_size]
print(list(gen_chunk_string_with_different_step(b, chunk_size, step_size)))
'''
#RESULT:
['adhaskjhdj', 'kjhdjkashd', 'kashdajksh', 'ajkshdkjas', 'dkjashdasl', 'hdaslallls', 'alllslslsl', 'lslslsaash', 'saashdahs', 'dahs']
'''
# Debugging
# Insert the following into a script at the point you'd like to debug to bring up an interactive IPython console;
# you can query the status of defined variables:
from IPython import embed; embed()
#dictionary
for key in d:
    pass
for key, value in d.items():
    pass
if key in d:
    pass
# some example dictionary comprehensions ( general idea: d = {k:v for k,v in a.items()} ) are at
# https://stackoverflow.com/questions/1031851/python-best-way-to-exchange-keys-with-values-in-a-dictionary
# in answer to "Python: Best Way to Exchange Keys with Values in a Dictionary?"
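# A minimal example of that key/value exchange with a dict comprehension (illustrative
# data; assumes the values are hashable and unique, since duplicates would collapse):
a = {'chrI': 230218, 'chrII': 813184}
inverted = {v: k for k, v in a.items()} # {230218: 'chrI', 813184: 'chrII'}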
# merge two dictionaries --> see https://www.geeksforgeeks.org/python-merging-two-dictionaries/
d = {**dict1, **dict2} # Python 3.5+; note `dict2.update(dict1)` merges in place and returns None, so don't assign its result
# do-while loop (https://stackoverflow.com/a/1662176/8508004)
while True:
    do_something()
    if condition():
        break
# get file extension from file name
# Now I'd use pathlib, see https://docs.python.org/3/library/pathlib.html `Path(filename_n_path).suffix`
# ; for the main part, use `Path(filename_n_path).stem`; see more about pathlib use in this document under 'Pathlib' below
import os
def generate_output_file_name(file_name, suffix_for_saving):
    '''
    Takes a file name as an argument and returns a string for the name of the
    output file. The generated name is based on the original file
    name.
    Specific example
    =================
    Calling the function with
        ("sequence.fa", "_col")
    returns
        "sequence_col.fa"
    '''
    main_part_of_name, file_extension = os.path.splitext(
        file_name) # from
    # http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    if '.' in file_name: # I don't know if this is needed with the os.path.splitext method, but I had it before so left it
        return main_part_of_name + suffix_for_saving + file_extension
    else:
        return file_name + suffix_for_saving + ".fa"
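# A sketch of the pathlib-based equivalent mentioned above, using only the documented
# `Path.stem` and `Path.suffix` attributes (the function name is my own; note `.stem`
# drops any directory part, unlike the os.path version above):
from pathlib import Path
def generate_output_file_name_via_pathlib(file_name, suffix_for_saving):
    '''
    Pathlib take on the same idea: ("sequence.fa", "_col") -> "sequence_col.fa".
    Falls back to adding ".fa" when the name has no extension, mirroring the
    os.path.splitext version above.
    '''
    p = Path(file_name)
    file_extension = p.suffix if p.suffix else ".fa"
    return p.stem + suffix_for_saving + file_extension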
# Floor
import math
x = int(math.floor(10.4)) # outer `int()` typecast in Python 3 is redundant
print(x)
# the outer typecast ensures the same result (an integer) is returned in both
# Python 2 and 3, since Python 2's floor returns a float.
# Note: you could get the same by typecasting to integer alone, but use of 'floor' makes it more explicit as to what was sought.
# Get HTML / URL in Python 2 or 3
# Getting html, originally for just Python 3, adapted from
# https://stackoverflow.com/a/17510727/8508004 and then updated to
# handle Python 2 and 3 according to the same link.
# (snippet with bonus Python 2 and 3 compatible variable unpacking and unicode decoding)
url = "http://www.example.org"
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen
html = urlopen(url)
chrom_and_length = {}
for line in html.read().splitlines():
    #name, chrom_len, *_ = line.strip().split()
    # that elegant unpack above is based on
    # https://stackoverflow.com/questions/11371204/unpack-the-first-two-elements-in-list-tuple
    # , but it won't work in Python 2. From the same place, one that works in 2:
    name, chrom_len = line.strip().split()[:2]
    chrom_and_length[name.decode(
        encoding='UTF-8')] = chrom_len.decode(encoding='UTF-8')
# generate names for sub-groups automatically, à la `subset_a`, `subset_b`, etc.
first_group_suffix = "a"
groupings = ["subset" + chr(ord(first_group_suffix) + x) for x in range(groups_to_make)]
# the `chr(ord(first_group_suffix) + x)` part of the line above is based on https://stackoverflow.com/a/2156898/8508004
# greater than or equal and less than or equal
assert seq_step_size <= shortest_feature_len, ("problem")
# >= for greater than or equal
# <= for less than or equal
# related use in interval comparison:
if 10000 <= number <= 30000:
    pass
# check if any items in two lists are shared (any overlap / overlapping?)
bool(set(a) & set(b))
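# For example (illustrative lists):
a = ['chrI', 'chrII', 'chrIII']
b = ['chrIII', 'chrIV']
print(bool(set(a) & set(b))) # True, because 'chrIII' occurs in both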
#list comprehension if and if-else conditionals
[x+1 if x >= 45 else x+5 for x in l]
# FOR JUST IF:
# "The if should be after the for (unless it is in an if-else ternary operator):
[y for y in a if y not in b]
# This would work however:
[y if y not in b else other_value for y in a]
# " -- from https://stackoverflow.com/a/15474969/8508004
# My edited version of the "just if" variation:
[x for x in list_for_order if x in b]
# List comprehension to add two items for each initial item in a list
# This example adds the number in the list plus the next value in the series
# for each item in a list.
# based on Games Brainiac's comment at https://stackoverflow.com/a/19466238/8508004 and DSM's
# comment at https://stackoverflow.com/a/11869360/8508004
l = [1,2,4]
[item for x in l for item in [x, x+1]] # results in `[1, 2, 2, 3, 4, 5]`
# see related information about `s.shift()` for getting a row and the next one in the pandas snippets
# plus see some nice list comprehensions in the 'count frequency' section above
# for the `count_frequency_in_list()` function
# plus there are these:
# remove all the blanks, i.e. `''` entries, in the sub-lists, and then
element_record[0] = [x for x in element_record[0] if x]
element_record[1] = [x for x in element_record[1] if x]
# join the contents that remain with commas to make a single string of ids
element_record[0] = ", ".join(element_record[0])
element_record[1] = ", ".join(element_record[1])
# -or- in one step, DO BOTH
# remove all the blanks, i.e. `''` entries, in the sub-lists, and then
# join the contents that remain with commas to make a single string of ids
element_record[0] = ", ".join([x for x in element_record[0] if x])
element_record[1] = ", ".join([x for x in element_record[1] if x])
# make a copy of a list with a specific item removed
first_characters_wo_candidate = [x for x in first_characters if x != basic_tag] # based
# on https://stackoverflow.com/a/25004389/8508004 (this will remove ALL occurrences);
# I couldn't find a way to both make a copy of the list and remove just the first instance of the
# item using `.remove(item)` in Python 2.7. Always had to copy the list and then use remove. Although
# potentially this looks like one line if you have numpy already imported --> https://stackoverflow.com/a/50313691/8508004
# " one-liner providing both the value of the minimum as well as the first index where it is realized" | |
# from https://coderwall.com/p/a9hvrg/index-of-minimum-element-of-a-list where describes | |
# works because the default order of tuples in Python is lexicographical. | |
# Approach works for `max` (maximum) too. | |
# Needed slight adjusting to not use xrange for Python 3/2 compatibility. | |
l = [33, 788, 1, -14, 78, 11, 32, 11, 78, -1, -14] | |
mn,idx = min( (l[i],i) for i in range(len(l)) ) | |
mn,idx | |
# next() with itertools
from itertools import cycle
#...
colors = (['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'xkcd:magenta', 'xkcd:orange',
    'tab:gray', 'tab:pink'])
#...
colors = cycle(colors)
#...
color = next(colors)
# works in 2 and 3, see https://stackoverflow.com/questions/5237611/itertools-cycle-next
# Dealing with detecting if something was in a region / interval
total_feature_levels = max([v["level"] for v in features_dict.values()])
#...
loc_intvls_of_features = {(
    v["start"], v["end"]): k for k, v in features_dict.items()} # dict of the intervals
# (keys) of sequence where features (values) are located
#...
# Use `loc_intvls_of_features` and get a list of features where the
# intervals contain what corresponds to the current column. Three
# categories: at_start, in_mid_interval, at_end. Could sort later
# for coordinating stylistic settings but might as well do it now
# since all the information is at hand.
at_start = []
in_mid_interval = []
at_end = []
for interval, feature in loc_intvls_of_features.items():
    if interval[0] == curr_residue_number:
        at_start.append(feature)
    elif interval[0] < curr_residue_number < interval[1]:
        in_mid_interval.append(feature)
    elif interval[1] == curr_residue_number:
        at_end.append(feature)
# Next, if there are contents in the feature lists, cycle through them,
# stylizing table cells of the current column appropriately.
for feature in at_start:
    stylize_start_features(feature, column, features_part)
for feature in in_mid_interval:
    stylize_mid_features(feature, column, features_part)
for feature in at_end:
    stylize_end_features(feature, column, features_part)
def overlap_exists(a, b):
    '''
    takes two intervals and returns whether they overlap. The intervals are
    defined by tuples of (start, stop).
    Examples:
    overlap_exists([20, 38], [1, 125])
    > True
    overlap_exists([10, 15], [20, 38])
    > False
    overlap_exists([10, 25], [20, 38])
    > True
    overlap_exists([10, 25], [25, 38])
    > True (sharing the single position 25 counts, given the +1 modification)
    based on https://stackoverflow.com/a/2953979/8508004 and the fact that 0 is
    the same as False and anything else is True.
    Modified to add the +1 because I want to be inclusive, so even one
    shared basepair is an overlap
    '''
    return bool(max(0, (min(a[1], b[1]) - max(a[0], b[0]) + 1)))
# left justify by adding spaces. Really useful when creating multiple sequence alignments to control the columns things show up in.
# based on https://stackoverflow.com/a/5676676/8508004
mviewlines_dict[i][0] = mviewlines_dict[i][0].ljust(len_longest_id)
# Minimum size of a string in a list
print(len(min(lizt_o_strings, key=len))) # based on https://stackoverflow.com/a/7228951/8508004
# operating system/shell dealings
# os.remove() to delete a file
# make a directory
import os, errno
try:
    os.makedirs(directory)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise
# move a file
import shutil
shutil.move("path/to/current/file.foo", "path/to/new/destination/for/file.foo")
# for more see [Replacing Bash Scripting with Python](https://github.com/ninjaaron/replacing-bash-scripting-with-python)
# see `fnmatch` examples in the useful notebook snippets gist.
# see `glob.glob` examples of matching file extensions and filenames, etc., in the useful notebook snippets gist.
# Pathlib
# in addition to this, see `pathlib` examples in the useful notebook snippets gist.
# Pathlib in Python 2 or 3 example:
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
# list all files in a directory
[item for item in Path('.').glob('*')] # based on
# https://jefftriplett.com/2017/pathlib-is-wonderful/
# list the final file extension, see 'Path(filename_n_path).suffix' at
# https://docs.python.org/3/library/pathlib.html
[item.suffix for item in Path('.').glob('*')]
# list the final suffixes if there is more than one - see 'Path.suffixes' at
# https://docs.python.org/3/library/pathlib.html
# the main part without the extension is 'Path(filename_n_path).stem'
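# For example (illustrative file name; these are documented pathlib behaviors):
p = Path('archive.tar.gz')
print(p.suffixes) # ['.tar', '.gz']
print(p.suffix)   # '.gz'
print(p.stem)     # 'archive.tar' -- `.stem` only strips the final suffix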
# Perform a function or carry out a calculation a certain percent of the time (i.e., with a random frequency)
import random
if random.random() < 0.8:
    do_something()
# example using it:
def get_unique_tag(string, length_of_tags):
    '''
    Takes a string and generates a unique tag of the provided length
    related to the provided string.
    Alphanumerics in the tag will be limited to those in the provided
    string. Although the same order is favored, there
    is a chance of returning characters out of order using
    shortuuid.
    shortuuid from https://github.com/skorokithakis/shortuuid
    '''
    import random
    if random.random() < 0.95:
        chunk_size = length_of_tags
        chunks = [string[i:i+chunk_size] for i in range(0, len(string), chunk_size)] # based on
        # https://stackoverflow.com/a/13673133/8508004 or https://stackoverflow.com/a/9475354/8508004
        '''
        # WRITTEN AS A FUNCTION
        def chunk_string(string, chunk_size):
            """Return a list of n-sized chunks from string of letters."""
            return [string[i:i+chunk_size] for i in range(0, len(string), chunk_size)]
        '''
        # discard chunks that are too short
        chunks = [x for x in chunks if len(x) == length_of_tags]
        return random.choice(chunks).lower()
    else:
        set_alphabet(string)
        return uuid()[:length_of_tags].lower()
# Pickle Python objects for storage
# for simple objects, json is suggested by Martijn Pieters at https://stackoverflow.com/q/25464295/8508004
# as lighter-weight and more portable than Python's pickling. (The same page also has a similar pickling
# example under Mike McKerns' answer.) Good for a list of strings, for example:
# Save as json
import json
with open('filename_list.json', 'w') as f:
    json.dump(CTD_seqs_fn_list, f)
# Read as json
import json
with open('filename_list.json', 'r') as f:
    filename_list = json.load(f)
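# For comparison, a minimal pickle equivalent (binary file mode matters here;
# `CTD_seqs_fn_list` is the same example list as in the json version above):
import pickle
with open('filename_list.pkl', 'wb') as f:
    pickle.dump(CTD_seqs_fn_list, f)
with open('filename_list.pkl', 'rb') as f:
    filename_list = pickle.load(f)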
# Subset to a random sampling of items in a list, based on https://pynative.com/python-random-sample/
import random
genomes = random.sample(population=genomes, k=15)
# Read and write to a file
import sys
# prepare output file for saving so it will be open and ready
with open(output_file_name, 'w') as output:
    # read in the input file
    with open(input_file_name, 'r') as input:
        # prepare to give feedback later or allow skipping to a certain start;
        # this gff3 doesn't have pertinent lines until line 10
        lines_processed = 0
        for line in input:
            lines_processed += 1
            if line.startswith("#") or lines_processed < 9:
                # Send text to output
                output.write(line)
            else:
                info = line.split("\t")
                #print(info)
                info[3] = str(adjust_pos(int(info[3]), ATP6_start_pos, chromosome_length))
                info[4] = str(adjust_pos(int(info[4]), ATP6_start_pos, chromosome_length))
                #print (info) # ONLY FOR DEBUGGING
                # Send text to output
                output.write(("\t").join(info))
# Feedback
sys.stderr.write("Positions were changed to match ATP6 as start "
    "and saved in '{}'.".format(output_file_name))
# Read an entire file into memory at once (extra bonus: this example removes linebreaks to make a long string) / Read all of a file at once
with open('data.txt', 'r') as myfile:
    data = myfile.read().replace('\n', '')
# Replace text in a file, combining much of the above approaches
script_name = "donut_plot_with_subgroups_from_dataframe.py"
def change_original_title(s):
    '''
    Change the plot title to the provided text.
    '''
    with open(script_name, 'r') as thefile:
        script = thefile.read()
    script = script.replace('BREAKDOWN', s)
    with open(script_name, 'w') as output_file:
        output_file.write(script)
change_original_title("NEW TITLE GOES HERE")
# Sort / Sorting
# FOR PYTHON FOR GENOMIC DATA SCIENCE I WAS LOOKING TO SORT A REPRESENTATION OF A DICTIONARY BASED ON VALUES
# see http://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
import operator
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
sorted_x = sorted(x.items(), key=operator.itemgetter(1))
# Later I found the same can be done without need for operator (and it is known to work in Python 3):
sorted_keys = sorted(my_dict, key=my_dict.get) # based on https://stackoverflow.com/a/37270275/8508004
# to sort keys from a dictionary based on value
# see my file `sorting out sorting on attributes within lists using keys.md` for more along these lines (a small sketch follows)
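# A small sketch along those lines: sorting items within a list on an attribute/field
# using a key function (the records here are made-up illustrative tuples):
from operator import itemgetter
records = [('chrII', 813184), ('chrI', 230218), ('chrIII', 316620)]
by_length = sorted(records, key=itemgetter(1), reverse=True) # largest first
by_name = sorted(records, key=itemgetter(0))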
# StringIO
import pandas as pd
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
df = pd.read_table(StringIO(Input), header=0, delim_whitespace=True)
# verified for both Python 2 and 3
# stderr.write
import sys
sys.stderr.write("\n\nThe dataframe was not stored for use elsewhere "
    "because `no_output` was specified in place of the output file name.")
sys.stderr.write("\n\nThe dataframe has been saved as a file in a "
    "manner where other Python programs can access\nthe created "
    "dataframe (pickled).\n"
    "The dataframe is stored as '{}'".format(out_name))
# String formatting
# my fav resources are https://pyformat.info/ and https://mkaz.blog/code/python-string-format-cookbook/
## Example with named placeholders
next_line = ("chr\t-\t{species_code}-{chrom}\t{chrom}\t0"
    "\t{length}\tblack".format(
    species_code=species_code, chrom=chrom, length=length))
# see https://mkaz.blog/code/python-string-format-cookbook/ for percentage and exponent notation,
# although I tend to prefer a capital E & that can easily be done by replacing the lowercase
# one that is shown with upper case, like:
print("{:.2E}".format(3.1415926)) # results in `3.14E+00`
print("{:.3E}".format(602213969921133261473164)) # results in `6.022E+23`
print("Half is {:.2%}".format(0.5000000)) # `.2%` limits to two decimals
# ternary operator if-else conditional (a if condition else b) for setting a variable
direction_string = "positive" if direction > 0 else "negative"
# Try except
try:
    return urlopen(url).read()
except HTTPError as e:
    #print e.code
    #print e.msg
    return "HTTPError"
# write to file
# prepare output file for saving so it will be open and ready
with open(output_file_name, 'w') as output_file:
    for indx, (chrom, length) in enumerate(chromosomes_and_length.items()):
        next_line = ("chr\t-\t{species_code}-{chrom}\t{chrom}\t0"
            "\t{length}\tblack".format(
            species_code=species_code, chrom=chrom, length=length))
        if indx < (len(chromosomes_and_length)-1):
            next_line += "\n" # don't add a newline character to the last line
        # Send the built line to output
        output_file.write(next_line)
# provide feedback
sys.stderr.write("\n\nThe karyotype file for {} chromosomes has been saved "
    "as a file named"
    " '{}'.".format(len(chromosomes_and_length), output_file_name))
# also see 'Read and write to a file' above
# encoding and decoding a string as base64, based on https://stackabuse.com/encoding-and-decoding-base64-strings-in-python/ (note you could also use zlib (see https://stackoverflow.com/a/29243206/8508004) but it seemed like it made things take up more space / look messier in the script than the original table, whereas base64 seemed a little less expansive and cleaner.)
# The idea of doing this was to make a table take up fewer lines of a script when hardcoding what would normally be data in a separate tsv file into a script. So hardcode it in, but in fewer lines.
import base64
message = wt_bendit_result
message_bytes = message.encode('ascii')
base64_bytes = base64.b64encode(message_bytes)
base64_message = base64_bytes.decode('ascii')
# base64_message is the string as base64
# UNDO CONVERSION
undecodedbase64 = base64.b64decode(base64_message)
# use `undecodedbase64 = undecodedbase64.decode()` after that if you need it
# as a string and are using `base64_message = '''dGhpcyBpcyBhIHRlc3Q='''` to
# hardcode into a script, because with just the above, that will produce bytes from
# `base64_message = '''dGhpcyBpcyBhIHRlc3Q='''` .
print(base64_message)
#Suppressing stdout / stderr when using an imported function or subprocess, etc.
#Vaguely similar to using `%%capture` to hush noisy output / code in notebooks.
#This is very useful when you are importing a function or running internal code as a subprocess and what it is saying in stderr feedback may be wrong or moot because of the way you are processing what the function returns.
# based on https://stackoverflow.com/a/52442331/8508004
# also see `bendIt_analysis.py` where I do it slightly differently (with `with io.capture_output() as captured:`)
# within a python script because it is running in a jupyter environment.
from contextlib import contextmanager, redirect_stderr, redirect_stdout
from os import devnull
@contextmanager
def suppress_stdout_stderr():
    """
    A context manager that redirects stdout and stderr to devnull.
    From https://stackoverflow.com/a/52442331/8508004
    """
    with open(devnull, 'w') as fnull:
        with redirect_stderr(fnull) as err, redirect_stdout(fnull) as out:
            yield (err, out)
#...
with suppress_stdout_stderr():
    pm_df = patmatch_results_to_df(result)
# A Python version of curl is in useful_ipython_to_python_snippets.py at https://gist.github.com/fomightez/ed79e33e97601d839dd550fd224d583c
# A Python version of checking that software to be run in the shell is installed in the environment is in useful_ipython_to_python_snippets.py at https://gist.github.com/fomightez/ed79e33e97601d839dd550fd224d583c