Useful Python snippets
# These are meant to work in both Python 2 and 3, except where noted.
# See my useful_pandas_snippets.py for those related to dataframes (such as pickling/`df.to_pickle(save_as)`)
# https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4
# also see https://gist.github.com/fomightez/324b7446dc08e56c83fa2d7af2b89a33 for examples of my
# frequently used Python functions and slight variations for more expanded, modular structures.
#argparse
# good snippet collection at https://mkaz.tech/code/python-argparse-cookbook/
# positional
parser.add_argument("input", help="Name of the file that was \
    generated by the other program \
    when run with your transcriptome of interest.", metavar="INPUT_FILE")
# with optional positional
parser.add_argument("input", nargs='?', help="**OPTIONAL** Name of the file \
    generated by the other program \
    when run with your transcriptome of interest. Usually, this is \
    '"+input_file_name_default+"' &\
    if no input file name is provided then this will be used by \
    default.", default=input_file_name_default, metavar="INPUT_FILE")
# Note: see https://stackoverflow.com/questions/18862836/how-to-open-file-using-argparse#comment35222484_18863004
# for why the `argparse.FileType` approach isn't used here.
# See
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments
# and
# https://docs.python.org/2/library/argparse.html#nargs for use of `nargs='?'`
# to make input and output file names optional. Note that the square brackets
# shown in the usage output signify optional, according to
# https://stackoverflow.com/questions/4480075/argparse-optional-positional-arguments#comment40460395_4480202
# , but because this is placed under the positionals I added clarifying text to the help
# description.
# IF MODIFYING THIS SCRIPT FOR USE ELSEWHERE AND YOU DON'T NEED/WANT THE OUTPUT
# FILE TO BE OPTIONAL, remove `nargs` (& default?) BUT KEEP THE PARTS WHERE NOT
# USING `argparse.FileType` AND INSTEAD USING `with open`, AS THAT IS CONSIDERED
# MORE PYTHONIC (a minimal sketch of that favored pattern follows).
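# A minimal sketch of that favored pattern: take the file name as a plain string
# from argparse and open it yourself with `with open`. (The argument name and
# help text here are illustrative placeholders, not from a specific script.)
import argparse
import sys
parser = argparse.ArgumentParser()
parser.add_argument("input", help="Name of the data file to read.",
    metavar="INPUT_FILE")
args = parser.parse_args()
with open(args.input, 'r') as input_handle:
    for line in input_handle:
        sys.stderr.write(line) # placeholder processing; a real script would parse here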
# With a list where you won't know the exact size, i.e., it could be one item
# (Example from `plot_expression_across_chromosomes.py`; see `donut_plot_with_subgroups_from_dataframe.py` for another)
parser.add_argument('-chr', '--chrs', action='store', type=str,
    help="use this flag to limit plotting of the data to particular \
    chromosomes or scaffolds you specify immediately following this flag. \
    Separate the chromosome or scaffold identifiers by commas, without spaces. \
    Example use in a command is `--chrs I,IV,XVI`. \
    Default when this optional flag is not called is to plot the data for all \
    chromosomes or scaffolds. ") # based on
# https://stackoverflow.com/questions/15753701/argparse-option-for-passing-a-list-as-option
# ; specifically, https://stackoverflow.com/a/24866869/8508004
#...
if args.chrs:
    if "," in args.chrs:
        limit_to_chrs = args.chrs.split(',')
    else:
        # means only one item
        limit_to_chrs = [args.chrs] # has to be a list for passing to Pandas `isin()`
# with flag parameters
parser.add_argument('-sa', '--save_as', action='store', type=str,
    default=generate_output_file_name(previous_pickled_df), help="Use \
    this option to supply a name of \
    the file to save for storing the produced dataframe. If none is provided, \
    the name '"+generate_output_file_name(previous_pickled_df)+"' will be \
    used. To force nothing to be saved, enter \
    `-sa no_output` without quotes as the output file (ATYPICAL).")
# with choices
# see https://stackoverflow.com/a/35970231/8508004
# or https://stackoverflow.com/a/15301183/8508004
# or https://stackoverflow.com/questions/40324356/python-argparse-choices-with-a-default-choice
parser.add_argument("-og", "--output_grouping", type=str,
    default="single", choices=["single", "separate", "both"],
    help="OPTIONAL: Specify grouping of output with this option. Choose \
    `-og single` for one table or dataframe for all categories. Or choose \
    `-og separate` for a separate table or dataframe for each category. \
    Or specify `-og both` to output both types. \
    If this option is not specified, {} will be used.".format("single"))
# Now-DISFAVORED approach for reading in files. (Disfavored because use of `with` is
# favored and is not compatible with this. So only use it when really in a hurry to
# scrape something together; see https://stackoverflow.com/questions/18862836/how-to-open-file-using-argparse#comment35222484_18863004)
parser.add_argument("MTA", help="Name of file containing data. REQUIRED.", type=argparse.FileType('r'), metavar="FILE")
# I would also like to trigger help to display if no arguments are provided, because at least one input file is needed
if len(sys.argv) == 1: # from http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu
    parser.print_help()
    sys.exit(1)
args = parser.parse_args()
the_file = args.MTA
# open input file and start reading
sys.stderr.write("\nReading input file...")
#input_file_stream = open(the_file, "r") # Don't need a separate open when using `type=argparse.FileType`. It sets everything up automatically and you will actually cause errors if you try to open when already open.
for line in the_file:
    pass
# assert
assert len(numbers_given_for_start_n_end) == len(aln_ids_in_out_order), (
    "The user-supplied list must be equal in length to the list of data "
    "previously supplied as 'aln_ids_in_out_order'.")
# Verify that values in a dictionary are equal in length using the `count` method of Python lists
lengths_of_sequences = [len(v) for v in sequence_dict.values()]
assert lengths_of_sequences.count(
    lengths_of_sequences[0]) == len(lengths_of_sequences), ("The length "
    "of all parsed sequences should be the same.")
# that assertion test involves Ivo van der Wijk's solution from
# https://stackoverflow.com/a/3844948/8508004
# Getting hex color codes and RGB values out of Seaborn color palettes / show colors
# based on https://stackoverflow.com/questions/38249454/extract-rgb-or-6-digit-code-from-seaborn-palette
# WORKS IN A JUPYTER NOTEBOOK CELL
import seaborn as sns
#num_shades = 8
#sns.palplot(sns.cubehelix_palette(num_shades))
pal = sns.color_palette("RdBu_r")
print(pal.as_hex())
print(pal)
sns.palplot(sns.color_palette("RdBu_r"))
sns.palplot(sns.diverging_palette(5, 250))
# Check if a string can be cast to a number, and cast it (example from `donut_plot_with_subgroups_from_dataframe.py`)
def is_number(s):
    '''
    check if a string can be cast to a float or numeric (integer).
    Takes a string.
    Returns True or False
    fixed from https://www.pythoncentral.io/how-to-check-if-a-string-is-a-number-in-python-including-unicode/
    later noted similar code is at https://code-maven.com/slides/python-programming/is-number
    '''
    try:
        float(s)
        return True
    except ValueError:
        pass
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False
def cast_to_number(s):
    '''
    Cast a string to a float or integer.
    Tries casting to float first, and if that works it then tries casting the
    string to an integer. (I thought I saw a suggestion of that order somewhere
    when searching for what I used as the `is_number()` check but cannot find
    the source right now.)
    Returns a float, int, or if it fails, False. (Where this is used, it
    shouldn't ever trigger returning `False` because it was checked first that
    all could be converted.)
    based on fixed code from https://www.pythoncentral.io/how-to-check-if-a-string-is-a-number-in-python-including-unicode/
    '''
    try:
        number = float(s)
        try:
            number = int(s)
            return number
        except ValueError:
            pass
        return number
    except ValueError:
        pass
    try:
        import unicodedata
        num = unicodedata.numeric(s)
        return num
    except (TypeError, ValueError):
        pass
    return False
# count frequency
def count_frequency_in_list(l):
    '''
    takes a list and returns a dictionary of the counts of the items in that
    list
    based on https://stackoverflow.com/a/2162045/8508004
    '''
    import collections
    return collections.Counter(l)
clusters_nofeat = clusters_nofeat_df["Acluster(+)"].tolist() + clusters_nofeat_df["Acluster(-)"].tolist()
# split up those that contain more than one, i.e., separated by commas, and strip whitespace;
# expanded list comprehension from https://stackoverflow.com/a/27886807/8508004
clusters_nofeat = [x.strip() for xs in clusters_nofeat for x in xs.split(',')]
# remove the blank/empty ones (this is how you remove empty strings from a list); it can also be used to remove blank lines from splits on `"\n"`
clusters_nofeat = [x for x in clusters_nofeat if x]
# remove duplicates
clusters_nofeat = set(clusters_nofeat)
# remove the dash and everything after it to count frequency
clusters_nofeat = [x.split("-")[0] for x in clusters_nofeat]
# count frequency
counts = count_frequency_in_list(clusters_nofeat)
clusters_nofeat
# next line based on https://stackoverflow.com/questions/2161752/how-to-count-the-frequency-of-the-elements-in-a-list#comment46593992_2162045
# ; otherwise you get a `Counter()` type
dict(counts)
from collections import Counter
most_common, num_most_common = Counter(some_list).most_common(1)[0] # based on
# https://stackoverflow.com/a/6987358/8508004
# unique items in a list that occur more than two times (or some other number you specify)
the_count = Counter(some_list)
print([k for k, v in the_count.items() if v > 2]) # based
# on https://stackoverflow.com/a/26773120/8508004 and
# https://stackoverflow.com/a/30418498/8508004 , to work in 2.7 and 3
# incrementing a count by instance while iterating on a list (could be applied to a dataframe column too)
# based on https://stackoverflow.com/a/1692428/8508004
from collections import defaultdict
l = ['apple','cherry','apple','cherry','cherry','pear']
d = defaultdict(int)
for x in l:
    d[x] += 1
    print("That is {} #{}".format(x, d[x]))
print("final counts: {}".format(dict(d)))
# count the number of non-overlapping occurrences of a substring
num_sequences = file_listing_text.count('.fa')
# combining counting and regular expressions (regex)
# count frequency of blocks of Ns in a string (presumably sequence)
import re
import collections
from collections import defaultdict
t = "NaaNNNhcTCaaNANANDANNNNNNAANNANNANNNNNNNNANANANNANNNNNN"
matches = []
len_match_dict = defaultdict(int)
min_number_Ns_in_row_to_collect = 1
pattern_obj = re.compile("N{{{},}}".format(min_number_Ns_in_row_to_collect), re.I) # adapted from
# code worked out in `collapse_large_unknown_blocks_in_DNA_sequence.py`, which relied heavily on
# https://stackoverflow.com/a/250306/8508004
for m in pattern_obj.finditer(t):
    len_match_dict[len(m.group())] += 1
    matches.append(m.group())
print(len_match_dict)
print(collections.Counter(matches))
#copy a list
# Important if you'll be iterating on it and modifying it at the same time. Modify the copy or iterate on the copy, but not both.
# Also useful if you want to start a new list with the contents of an old one. If you just do `new_list = old_list`, that just copies
# the reference to the list, see https://stackoverflow.com/a/2612815/8508004 . That probably won't be what you want.
new_list = old_list.copy()
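# Quick demo of the aliasing pitfall vs. a true copy (illustrative values; note
# `list.copy()` is Python 3 only, while a slice copy works in Python 2 as well):
old_list = [1, 2, 3]
alias = old_list        # both names point at the SAME list object
alias.append(4)
print(old_list)         # [1, 2, 3, 4] -- changed through the alias
true_copy = old_list[:] # a slice makes an independent shallow copy
true_copy.append(5)
print(old_list)         # still [1, 2, 3, 4] -- unchanged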
# Split a list into chunks, also collecting any remainder group
# from https://stackoverflow.com/a/312464/8508004
def chunks(a_list, n):
    """Yield successive n-sized chunks from list."""
    for i in range(0, len(a_list), n):
        yield a_list[i:i+n]
print(list(chunks(range(10, 75), 10)))
# there is another approach with a list comprehension for a string in this gist that is based
# on https://stackoverflow.com/a/13673133/8508004 and looks similar to the list comprehension one for
# lists at https://stackoverflow.com/a/32467096/8508004
# RELATED: chunking with a step/stride of a certain amount to produce overlap. For example, to reproduce the process
# described in this figure legend: "The data were calculated from 150-base pair windows with 50 base pairs of overlap between adjacent windows and plotted ..."
a = "adhaskjhdjkashdajkshdkjashdaslalllslslslsaashdahs"
chunk_size = 10
step_size = 5
def chunk_string_with_different_step(string, chunk_size, step_size):
    """Return a list of n-sized chunks from string of letters."""
    return [string[i:i+chunk_size] for i in range(0, len(string), step_size)]
chunk_string_with_different_step(a, chunk_size, step_size)
'''
#RESULT:
['adhaskjhdj',
 'kjhdjkashd',
 'kashdajksh',
 'ajkshdkjas',
 'dkjashdasl',
 'hdaslallls',
 'alllslslsl',
 'lslslsaash',
 'saashdahs',
 'dahs']
'''
# AS GENERATOR:
b = "adhaskjhdjkashdajkshdkjashdaslalllslslslsaashdahs"
def gen_chunk_string_with_different_step(a_list, chunk_size, step_amount):
    """Yield successive n-sized chunks from list, stepping/striding by step_amount."""
    for i in range(0, len(a_list), step_amount):
        yield a_list[i:i+chunk_size]
print(list(gen_chunk_string_with_different_step(b, chunk_size, step_size)))
'''
#RESULT:
['adhaskjhdj', 'kjhdjkashd', 'kashdajksh', 'ajkshdkjas', 'dkjashdasl', 'hdaslallls', 'alllslslsl', 'lslslsaash', 'saashdahs', 'dahs']
'''
# Debugging
# Insert the following into a script at the point you'd like to debug to bring up an interactive IPython console;
# you can query the status of defined variables:
from IPython import embed; embed()
#dictionary
for key in d:
    pass
for key, value in d.items():
    pass
if key in d:
    pass
# some example dictionary comprehensions ( general idea: d = {k:v for k,v in a.items()} ) are at
# https://stackoverflow.com/questions/1031851/python-best-way-to-exchange-keys-with-values-in-a-dictionary
# in answer to "Python: Best Way to Exchange Keys with Values in a Dictionary?"
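# A minimal example of that key/value exchange with a dict comprehension (illustrative
# data; assumes the values are hashable and unique, since duplicates would collapse):
a = {'chrI': 230218, 'chrII': 813184}
inverted = {v: k for k, v in a.items()} # {230218: 'chrI', 813184: 'chrII'}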
# merge two dictionaries --> see https://www.geeksforgeeks.org/python-merging-two-dictionaries/
d = {**dict1, **dict2} # Python 3.5+; note `dict2.update(dict1)` merges in place and returns None, so don't assign its result
# do-while loop (https://stackoverflow.com/a/1662176/8508004)
while True:
    do_something()
    if condition():
        break
# get file extension from file name
# Now I'd use pathlib, see https://docs.python.org/3/library/pathlib.html `Path(filename_n_path).suffix`
# ; for the main part, use `Path(filename_n_path).stem`; see more about pathlib use in this document under 'Pathlib' below
import os
def generate_output_file_name(file_name, suffix_for_saving):
    '''
    Takes a file name as an argument and returns a string for the name of the
    output file. The generated name is based on the original file
    name.
    Specific example
    =================
    Calling the function with
        ("sequence.fa", "_col")
    returns
        "sequence_col.fa"
    '''
    main_part_of_name, file_extension = os.path.splitext(
        file_name) # from
    # http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
    if '.' in file_name: # I don't know if this is needed with the os.path.splitext method, but I had it before so left it
        return main_part_of_name + suffix_for_saving + file_extension
    else:
        return file_name + suffix_for_saving + ".fa"
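# A sketch of the pathlib-based equivalent mentioned above, using only the documented
# `Path.stem` and `Path.suffix` attributes (the function name is my own; note `.stem`
# drops any directory part, unlike the os.path version above):
from pathlib import Path
def generate_output_file_name_via_pathlib(file_name, suffix_for_saving):
    '''
    Pathlib take on the same idea: ("sequence.fa", "_col") -> "sequence_col.fa".
    Falls back to adding ".fa" when the name has no extension, mirroring the
    os.path.splitext version above.
    '''
    p = Path(file_name)
    file_extension = p.suffix if p.suffix else ".fa"
    return p.stem + suffix_for_saving + file_extension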
# Floor
import math
x = int(math.floor(10.4)) # outer `int()` typecast in Python 3 is redundant
print(x)
# the outer typecast ensures the same result (an integer) is returned in both
# Python 2 and 3, since Python 2's floor returns a float.
# Note: you could get the same by typecasting to integer alone, but use of 'floor' makes it more explicit as to what was sought.
# Get HTML / URL in Python 2 or 3
# Getting html, originally for just Python 3, adapted from
# https://stackoverflow.com/a/17510727/8508004 and then updated to
# handle Python 2 and 3 according to the same link.
# (snippet with bonus Python 2 and 3 compatible variable unpacking and unicode decoding)
url = "http://www.example.org"
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen
html = urlopen(url)
chrom_and_length = {}
for line in html.read().splitlines():
    #name, chrom_len, *_ = line.strip().split()
    # that elegant unpack above is based on
    # https://stackoverflow.com/questions/11371204/unpack-the-first-two-elements-in-list-tuple
    # , but it won't work in Python 2. From the same place, one that works in 2:
    name, chrom_len = line.strip().split()[:2]
    chrom_and_length[name.decode(
        encoding='UTF-8')] = chrom_len.decode(encoding='UTF-8')
# generate names for sub-groups automatically, à la `subset_a`, `subset_b`, etc.
first_group_suffix = "a"
groupings = ["subset" + chr(ord(first_group_suffix) + x) for x in range(groups_to_make)]
# the `chr(ord(first_group_suffix) + x)` part of the line above is based on https://stackoverflow.com/a/2156898/8508004
# greater than or equal and less than or equal
assert seq_step_size <= shortest_feature_len, ("problem")
# >= for greater than or equal
# <= for less than or equal
# related use in interval comparison:
if 10000 <= number <= 30000:
    pass
# check if any items in two lists are shared (any overlap / overlapping?)
bool(set(a) & set(b))
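# For example (illustrative lists):
a = ['chrI', 'chrII', 'chrIII']
b = ['chrIII', 'chrIV']
print(bool(set(a) & set(b))) # True, because 'chrIII' occurs in both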
#list comprehension if and if-else conditionals
[x+1 if x >= 45 else x+5 for x in l]
# FOR JUST IF:
# "The if should be after the for (unless it is in an if-else ternary operator):
[y for y in a if y not in b]
# This would work however:
[y if y not in b else other_value for y in a]
# " -- from https://stackoverflow.com/a/15474969/8508004
# My edited version of the "just if" variation:
[x for x in list_for_order if x in b]
# List comprehension to add two items for each initial item in a list
# This example adds the number in the list plus the next value in the series
# for each item in a list.
# based on Games Brainiac's comment at https://stackoverflow.com/a/19466238/8508004 and DSM's
# comment at https://stackoverflow.com/a/11869360/8508004
l = [1,2,4]
[item for x in l for item in [x, x+1]] # results in `[1, 2, 2, 3, 4, 5]`
# see related information about `s.shift()` for getting a row and the next one in the pandas snippets
# plus see some nice list comprehensions in the 'count frequency' section above
# for the `count_frequency_in_list()` function
# plus there are these:
# remove all the blanks, i.e. `''` entries, in the sub-lists, and then
element_record[0] = [x for x in element_record[0] if x]
element_record[1] = [x for x in element_record[1] if x]
# join the contents that remain with commas to make a single string of ids
element_record[0] = ", ".join(element_record[0])
element_record[1] = ", ".join(element_record[1])
# -or- in one step, DO BOTH
# remove all the blanks, i.e. `''` entries, in the sub-lists, and then
# join the contents that remain with commas to make a single string of ids
element_record[0] = ", ".join([x for x in element_record[0] if x])
element_record[1] = ", ".join([x for x in element_record[1] if x])
# make a copy of a list with a specific item removed
first_characters_wo_candidate = [x for x in first_characters if x != basic_tag] # based
# on https://stackoverflow.com/a/25004389/8508004 (this will remove ALL occurrences);
# I couldn't find a way to both make a copy of the list and remove just the first instance of the
# item using `.remove(item)` in Python 2.7. Always had to copy the list and then use remove. Although
# potentially this looks like one line if you have numpy already imported --> https://stackoverflow.com/a/50313691/8508004
# " one-liner providing both the value of the minimum as well as the first index where it is realized" | |
# from https://coderwall.com/p/a9hvrg/index-of-minimum-element-of-a-list where describes | |
# works because the default order of tuples in Python is lexicographical. | |
# Approach works for `max` (maximum) too. | |
# Needed slight adjusting to not use xrange for Python 3/2 compatibility. | |
l = [33, 788, 1, -14, 78, 11, 32, 11, 78, -1, -14] | |
mn,idx = min( (l[i],i) for i in range(len(l)) ) | |
mn,idx | |
# next() with itertools
from itertools import cycle
#...
colors = (['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'xkcd:magenta', 'xkcd:orange',
    'tab:gray', 'tab:pink'])
#...
colors = cycle(colors)
#...
color = next(colors)
# works in 2 and 3, see https://stackoverflow.com/questions/5237611/itertools-cycle-next
# Dealing with detecting if something was in a region / interval
total_feature_levels = max([v["level"] for v in features_dict.values()])
#...
loc_intvls_of_features = {(
    v["start"], v["end"]): k for k, v in features_dict.items()} # dict of the intervals
# (keys) of sequence where features (values) are located
#...
# Use `loc_intvls_of_features` and get a list of features where the
# intervals contain what corresponds to the current column. Three
# categories: at_start, in_mid_interval, at_end. Could sort later
# for coordinating stylistic settings but might as well do it now
# since all the information is at hand.
at_start = []
in_mid_interval = []
at_end = []
for interval, feature in loc_intvls_of_features.items():
    if interval[0] == curr_residue_number:
        at_start.append(feature)
    elif interval[0] < curr_residue_number < interval[1]:
        in_mid_interval.append(feature)
    elif interval[1] == curr_residue_number:
        at_end.append(feature)
# Next, if there are contents in the feature lists, cycle through them,
# stylizing table cells of the current column appropriately.
for feature in at_start:
    stylize_start_features(feature, column, features_part)
for feature in in_mid_interval:
    stylize_mid_features(feature, column, features_part)
for feature in at_end:
    stylize_end_features(feature, column, features_part)
def overlap_exists(a, b):
    '''
    takes two intervals and returns whether they overlap. The intervals are
    defined by tuples of (start, stop).
    Examples:
    overlap_exists([20, 38], [1, 125])
    > True
    overlap_exists([10, 15], [20, 38])
    > False
    overlap_exists([10, 25], [20, 38])
    > True
    overlap_exists([10, 25], [25, 38])
    > True (sharing the single position 25 counts, given the +1 modification)
    based on https://stackoverflow.com/a/2953979/8508004 and the fact that 0 is
    the same as False and anything else is True.
    Modified to add the +1 because I want to be inclusive, so even one
    shared basepair is an overlap
    '''
    return bool(max(0, (min(a[1], b[1]) - max(a[0], b[0]) + 1)))
# left justify by adding spaces. Really useful when creating multiple sequence alignments to control the columns things show up in.
# based on https://stackoverflow.com/a/5676676/8508004
mviewlines_dict[i][0] = mviewlines_dict[i][0].ljust(len_longest_id)
# Minimum size of a string in a list
print(len(min(lizt_o_strings, key=len))) # based on https://stackoverflow.com/a/7228951/8508004
# operating system/shell dealings
# os.remove() to delete a file
# make a directory
import os, errno
try:
    os.makedirs(directory)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise
# move a file
import shutil
shutil.move("path/to/current/file.foo", "path/to/new/destination/for/file.foo")
# for more see [Replacing Bash Scripting with Python](https://github.com/ninjaaron/replacing-bash-scripting-with-python)
# see `fnmatch` examples in the useful notebook snippets gist.
# see `glob.glob` examples of matching file extensions and filenames, etc., in the useful notebook snippets gist.
# Pathlib
# in addition to this, see `pathlib` examples in the useful notebook snippets gist.
# Pathlib in Python 2 or 3 example:
try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path
# list all files in a directory
[item for item in Path('.').glob('*')] # based on
# https://jefftriplett.com/2017/pathlib-is-wonderful/
# list the final file extension, see 'Path(filename_n_path).suffix' at
# https://docs.python.org/3/library/pathlib.html
[item.suffix for item in Path('.').glob('*')]
# list the final suffixes if there is more than one - see 'Path.suffixes' at
# https://docs.python.org/3/library/pathlib.html
# the main part without the extension is 'Path(filename_n_path).stem'
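# For example (illustrative file name; these are documented pathlib behaviors):
p = Path('archive.tar.gz')
print(p.suffixes) # ['.tar', '.gz']
print(p.suffix)   # '.gz'
print(p.stem)     # 'archive.tar' -- `.stem` only strips the final suffix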
# Perform a function or carry out a calculation a certain percent of the time (i.e., with a random frequency)
import random
if random.random() < 0.8:
    do_something()
# example using it:
def get_unique_tag(string, length_of_tags):
    '''
    Takes a string and generates a unique tag of the provided length
    related to the provided string.
    Alphanumerics in the tag will be limited to those in the provided
    string. Although the same order is favored, there
    is a chance of returning characters out of order using
    shortuuid.
    shortuuid from https://github.com/skorokithakis/shortuuid
    '''
    import random
    if random.random() < 0.95:
        chunk_size = length_of_tags
        chunks = [string[i:i+chunk_size] for i in range(0, len(string), chunk_size)] # based on
        # https://stackoverflow.com/a/13673133/8508004 or https://stackoverflow.com/a/9475354/8508004
        '''
        # WRITTEN AS A FUNCTION
        def chunk_string(string, chunk_size):
            """Return a list of n-sized chunks from string of letters."""
            return [string[i:i+chunk_size] for i in range(0, len(string), chunk_size)]
        '''
        # discard chunks that are too short
        chunks = [x for x in chunks if len(x) == length_of_tags]
        return random.choice(chunks).lower()
    else:
        set_alphabet(string)
        return uuid()[:length_of_tags].lower()
# Pickle Python objects for storage
# for simple objects, json is suggested by Martijn Pieters at https://stackoverflow.com/q/25464295/8508004
# as lighter-weight and more portable than Python's pickling. (The same page also has a similar pickling
# example under Mike McKerns' answer.) Good for a list of strings, for example:
# Save as json
import json
with open('filename_list.json', 'w') as f:
    json.dump(CTD_seqs_fn_list, f)
# Read as json
import json
with open('filename_list.json', 'r') as f:
    filename_list = json.load(f)
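# For comparison, a minimal pickle equivalent (binary file mode matters here;
# `CTD_seqs_fn_list` is the same example list as in the json version above):
import pickle
with open('filename_list.pkl', 'wb') as f:
    pickle.dump(CTD_seqs_fn_list, f)
with open('filename_list.pkl', 'rb') as f:
    filename_list = pickle.load(f)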
# Subset to a random sampling of items in a list, based on https://pynative.com/python-random-sample/
import random
genomes = random.sample(population=genomes, k=15)
# Read and write to a file
import sys
# prepare output file for saving so it will be open and ready
with open(output_file_name, 'w') as output:
    # read in the input file
    with open(input_file_name, 'r') as input:
        # prepare to give feedback later or allow skipping to a certain start;
        # this gff3 doesn't have pertinent lines until line 10
        lines_processed = 0
        for line in input:
            lines_processed += 1
            if line.startswith("#") or lines_processed < 9:
                # Send text to output
                output.write(line)
            else:
                info = line.split("\t")
                #print(info)
                info[3] = str(adjust_pos(int(info[3]), ATP6_start_pos, chromosome_length))
                info[4] = str(adjust_pos(int(info[4]), ATP6_start_pos, chromosome_length))
                #print (info) # ONLY FOR DEBUGGING
                # Send text to output
                output.write(("\t").join(info))
# Feedback
sys.stderr.write("Positions were changed to match ATP6 as start "
    "and saved in '{}'.".format(output_file_name))
# Read an entire file into memory at once (extra bonus: this example removes linebreaks to make a long string) / Read all of a file at once
with open('data.txt', 'r') as myfile:
    data = myfile.read().replace('\n', '')
# Replace text in a file, combining much of the above approaches
script_name = "donut_plot_with_subgroups_from_dataframe.py"
def change_original_title(s):
    '''
    Change the plot title to the provided text.
    '''
    with open(script_name, 'r') as thefile:
        script = thefile.read()
    script = script.replace('BREAKDOWN', s)
    with open(script_name, 'w') as output_file:
        output_file.write(script)
change_original_title("NEW TITLE GOES HERE")
# Sort / Sorting
# FOR PYTHON FOR GENOMIC DATA SCIENCE I WAS LOOKING TO SORT A REPRESENTATION OF A DICTIONARY BASED ON VALUES
# see http://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
import operator
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
sorted_x = sorted(x.items(), key=operator.itemgetter(1))
# Later I found the same can be done without need for operator (and it is known to work in Python 3):
sorted_keys = sorted(my_dict, key=my_dict.get) # based on https://stackoverflow.com/a/37270275/8508004
# to sort keys from a dictionary based on value
# see my file `sorting out sorting on attributes within lists using keys.md` for more along these lines (a small sketch follows)
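# A small sketch along those lines: sorting items within a list on an attribute/field
# using a key function (the records here are made-up illustrative tuples):
from operator import itemgetter
records = [('chrII', 813184), ('chrI', 230218), ('chrIII', 316620)]
by_length = sorted(records, key=itemgetter(1), reverse=True) # largest first
by_name = sorted(records, key=itemgetter(0))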
# StringIO
import pandas as pd
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
df = pd.read_table(StringIO(Input), header=0, delim_whitespace=True)
# verified for both Python 2 and 3
# stderr.write
import sys
sys.stderr.write("\n\nThe dataframe was not stored for use elsewhere "
    "because `no_output` was specified in place of the output file name.")
sys.stderr.write("\n\nThe dataframe has been saved as a file in a "
    "manner where other Python programs can access\nthe created "
    "dataframe (pickled).\n"
    "The dataframe is stored as '{}'".format(out_name))
# String formatting
# my fav resources are https://pyformat.info/ and https://mkaz.blog/code/python-string-format-cookbook/
## Example with named placeholders
next_line = ("chr\t-\t{species_code}-{chrom}\t{chrom}\t0"
    "\t{length}\tblack".format(
    species_code=species_code, chrom=chrom, length=length))
# see https://mkaz.blog/code/python-string-format-cookbook/ for percentage and exponent notation,
# although I tend to prefer a capital E & that can easily be done by replacing the lowercase
# one that is shown with upper case, like:
print("{:.2E}".format(3.1415926)) # results in `3.14E+00`
print("{:.3E}".format(602213969921133261473164)) # results in `6.022E+23`
print("Half is {:.2%}".format(0.5000000)) # `.2%` limits to two decimals
# ternary operator if-else conditional (a if condition else b) for setting a variable
direction_string = "positive" if direction > 0 else "negative"
# Try except
try:
    return urlopen(url).read()
except HTTPError as e:
    #print e.code
    #print e.msg
    return "HTTPError"
# write to file
# prepare output file for saving so it will be open and ready
with open(output_file_name, 'w') as output_file:
    for indx, (chrom, length) in enumerate(chromosomes_and_length.items()):
        next_line = ("chr\t-\t{species_code}-{chrom}\t{chrom}\t0"
            "\t{length}\tblack".format(
            species_code=species_code, chrom=chrom, length=length))
        if indx < (len(chromosomes_and_length)-1):
            next_line += "\n" # don't add a newline character to the last line
        # Send the built line to output
        output_file.write(next_line)
# provide feedback
sys.stderr.write("\n\nThe karyotype file for {} chromosomes has been saved "
    "as a file named"
    " '{}'.".format(len(chromosomes_and_length), output_file_name))
# also see 'Read and write to a file' above
# encoding and decoding a string as base64, based on https://stackabuse.com/encoding-and-decoding-base64-strings-in-python/ (note you could also use zlib (see https://stackoverflow.com/a/29243206/8508004) but it seemed like it made things take up more space / look messier in the script than the original table, whereas base64 seemed a little less expansive and cleaner.)
# The idea of doing this was to make a table take up fewer lines of a script when hardcoding what would normally be data in a separate tsv file into a script. So hardcode it in, but in fewer lines.
import base64
message = wt_bendit_result
message_bytes = message.encode('ascii')
base64_bytes = base64.b64encode(message_bytes)
base64_message = base64_bytes.decode('ascii')
# base64_message is the string as base64
# UNDO CONVERSION
undecodedbase64 = base64.b64decode(base64_message)
# use `undecodedbase64 = undecodedbase64.decode()` after that if you need it
# as a string and are using `base64_message = '''dGhpcyBpcyBhIHRlc3Q='''` to
# hardcode into a script, because with just the above, that will produce bytes from
# `base64_message = '''dGhpcyBpcyBhIHRlc3Q='''` .
print(base64_message)
#Suppressing stdout / stderr when using an imported function or subprocess, etc.
#Vaguely similar to using `%%capture` to hush noisy output / code in notebooks.
#This is very useful when you are importing a function or running internal code as a subprocess and what it is saying in stderr feedback may be wrong or moot because of the way you are processing what the function returns.
# based on https://stackoverflow.com/a/52442331/8508004
# also see `bendIt_analysis.py` where I do it slightly differently (with `with io.capture_output() as captured:`)
# within a python script because it is running in a jupyter environment.
from contextlib import contextmanager, redirect_stderr, redirect_stdout
from os import devnull
@contextmanager
def suppress_stdout_stderr():
    """
    A context manager that redirects stdout and stderr to devnull.
    From https://stackoverflow.com/a/52442331/8508004
    """
    with open(devnull, 'w') as fnull:
        with redirect_stderr(fnull) as err, redirect_stdout(fnull) as out:
            yield (err, out)
#...
with suppress_stdout_stderr():
    pm_df = patmatch_results_to_df(result)
# A Python version of curl is in useful_ipython_to_python_snippets.py at https://gist.github.com/fomightez/ed79e33e97601d839dd550fd224d583c
# A Python version of checking that software to be run in the shell is installed in the environment is in useful_ipython_to_python_snippets.py at https://gist.github.com/fomightez/ed79e33e97601d839dd550fd224d583c