Question

I have a hierarchical tree in Newick format, such as:

(A:0.556705,(B:0.251059,C:0.251059):0.305646):0.556705;

Parentheses represent tree topology (clade structure/groupings) and numbers represent branch lengths (distances), such that:

enter image description here

I need to collapse clades (sub-trees) whose average distances to the tips (terminal nodes/tips/leaves) are less than a given value, x. Such that input and collapsed output are in Newick format. For example, if x was 0.29, then B and C in the above collapsed into BC, we get something like:

(A:0.556705,BC:0.556705):0.556705;

enter image description here

Is there a simple way to programmatically achieve this collapsing given any Newick tree (e.g. in Python)?

Was it helpful?

Solution

This quick and dirty code snippet seems to work on your minimal tree, but I would need more sample data to check it with more complex trees:

#! /usr/bin/python3

class Tree:
    def __init__ (self, tokens):
        self.children = []
        while tokens:
            self.children.append ( (tokens [1], float (tokens [0] ) ) )
            tokens = tokens [2:]

    def __repr__ (self):
        return '<{}>'.format (self.children)

    def pprint (self, indent = 0):
        prefix = '  ' * indent
        for child, dist in self.children:
            if isinstance (child, Tree):
                print (prefix, dist)
                child.pprint (indent + 1)
            else:
                print (prefix, child, dist)

    def collapse (self, limit):
        self.children = [ (child.collapse (limit) [0], dist)
            if isinstance (child, Tree)
            else (child, dist) for child, dist in self.children]
        avg = sum (dist for _, dist in self.children) / len (self.children)
        if avg > limit:
            return (self, 0)
        else:
            if any (isinstance (child, Tree) for child in self.children):
                print ('Would like to collapse, but cannot, as at least one child is a tree.')
                return (self, 0)
            return (''.join (child for child, _ in self.children), 0)


def parse (tree):
    stack = []
    buff = ''
    while True:
        c = tree [0]
        if c == ';': break
        tree = tree [1:]
        if c == '(':
            stack.insert (0, '(')
            continue
        if c == ')':
            if buff: stack.insert (0, buff)
            buff = ''
            popped = ''
            tokens = []
            while True:
                token = stack [0]
                stack = stack [1:]
                if token == '(': break
                tokens.append (token)
            stack.insert (0, Tree (tokens) )
            continue
        if c in ':,':
            if buff: stack.insert (0, buff)
            buff = ''
            continue
        buff += c
    if buff: stack.insert (0, buff)
    return Tree (stack)

t = parse ('(A:0.556705,(B:0.251059,C:0.251059):0.305646):0.556705;')
t.pprint ()
t.collapse (.3)
print ()
t.pprint ()

The uncollapsed tree is:

 0.556705
   0.305646
     C 0.251059
     B 0.251059
   A 0.556705

The collapsed tree by .3 is:

 0.556705
   CB 0.305646
   A 0.556705

The collapsed tree by 1.0 is:

CBA 0.556705

The collapsed tree by 0.1 is:

 0.556705
   0.305646
     C 0.251059
     B 0.251059
   A 0.556705

Here another cleaner version of the code (which produces nerwick notation output):

NOTA BENE: The input tree must be parenthesized.

#! /usr/bin/python3

class Tree:
    def __init__ (self, weight, children):
        self.weight = weight
        self.children = children [:]

    def distances (self, curDistance = .0, acc = None):
        if acc is None: acc = []
        for child in self.children:
            child.distances (self.weight + curDistance, acc)
        return acc

    def collapse (self, limit):
        self.children = [child.collapse (limit) for child in self.children]
        distances = self.distances (-self.weight)
        avg = sum (distances) / len (distances)
        if avg > limit: return self
        return Node (self.weight, ''.join (self.descendants () ) )

    def descendants (self):
        descendants = []
        for child in self.children:
            descendants.extend (child.descendants () )
        return descendants

    def __repr__ (self):
        return '({}):{}'.format (','.join (str (child) for child in self.children), self.weight)

class Node:
    def __init__ (self, weight, name):
        self.weight = weight
        self.name = name

    def distances (self, curDistance, acc):
        acc.append (curDistance + self.weight)

    def collapse (self, limit):
        return self

    def descendants (self):
        return [self.name]

    def __repr__ (self):
        return '{}:{}'.format (self.name, self.weight)

class Stack (list):
    def pop (self):
        e = self [0]
        del self [0]
        return e

    def push (self, e):
        self.insert (0, e)

def parse (tree):
    buff = ''
    stack = Stack ()
    while True:
        c = tree [0]
        if c == ';': break
        tree = tree [1:]

        if c == '(':
            stack.push (c)
            continue

        if c in ':,':
            if buff: stack.push (buff)
            buff = ''
            continue

        if c == ')':
            if buff: stack.push (buff)
            buff = ''
            popped = ''
            children = []
            while True:
                weight = stack.pop ()
                if weight == '(': break
                weight = float (weight)
                child = stack.pop ()
                if isinstance (child, Tree):
                    child.weight = weight
                else:
                    child = Node (weight, child)
                children.append (child)
            stack.push (Tree (0, children) )
            continue

        buff += c

    return stack.pop ()

t = parse ('((A:0.9,(B:0.2,C:0.3):0.3,(E:0.05,F:0.08):0.1):0.6);')
print ('Input tree is {}'.format (t) )
for limit in range (1, 6):
    limit = limit / 10
    print ('Collapsed by {} is {}'.format (limit, t.collapse (limit) ) )

Output is (with modified input):

Input tree is (((F:0.08,E:0.05):0.1,(C:0.3,B:0.2):0.3,A:0.9):0.6):0
Collapsed by 0.1 is ((FE:0.1,(C:0.3,B:0.2):0.3,A:0.9):0.6):0
Collapsed by 0.2 is ((FE:0.1,(C:0.3,B:0.2):0.3,A:0.9):0.6):0
Collapsed by 0.3 is ((FE:0.1,CB:0.3,A:0.9):0.6):0
Collapsed by 0.4 is ((FE:0.1,CB:0.3,A:0.9):0.6):0
Collapsed by 0.5 is (FECBA:0.6):0

OTHER TIPS

Here maybe a more robust parser which eats all examples here.

#! /usr/bin/python3

class Tree:
    def __init__ (self):
        self.name = ''
        self.length = None
        self.children = []

    def __repr__ (self):
        return '({}){}{}'.format (
                ','.join (str (child) for child in self.children),
                self.name,
                ':{}'.format (self.length) if self.length is not None else '')

class Node:
    def __init__ (self, name = ''):
        self.name = name
        self.length = None

    def __repr__ (self):
        return '{}{}'.format (self.name, ':{}'.format (self.length) if self.length is not None else '')

class Stack (list):
    def push (self, e):
        self.append (e)

    def pop (self):
        e = self [-1]
        del self [-1]
        return e

    def peek (self):
        return self [-1]

class UnexpectedSymbolException (Exception): pass

def parseNameOrLength (stack, buff):
        if stack.peek () == ':':
            try: length = float (buff)
            except ValueError:
                raise ValueError ('Non-numeric length "{}" at position {}.'.format (buff, pos) )
            stack.pop ()
            stack.peek ().length = length
        elif buff:
            stack.peek ().name = buff

def parse (tree):
    stack = Stack ()
    stack.push (Tree () )
    buff = ''
    pos = -1
    tree = tree.strip ()
    while True:
        pos += 1
        c = tree [0]
        tree = tree [1:].strip ()
        if tree: la = tree [0]

        if c == '(':
            if buff:
                raise UnexpectedSymbolException (
                        'Unexpected symbol {} at position {}.'.format (c, pos) )
            if la == '(': stack.push (Tree () )
            else: stack.push (Node () )
            continue

        if c == ')':
            parseNameOrLength (stack, buff)
            buff = ''

            child = stack.pop ()
            stack.peek ().children.append (child)
            continue

        if c in ',;':
            parseNameOrLength (stack, buff)
            buff = ''

            if c == ';': return stack.pop ()

            child = stack.pop ()
            stack.peek ().children.append (child)
            if la == '(': stack.push (Tree () )
            else: stack.push (Node () )
            continue

        if c == ':':
            if stack.peek () == ':':
                raise UnexpectedSymbolException (
                        'Unexpected symbol {} at position {}.'.format (c, pos) )
            stack.peek ().name = buff
            stack.push (':')
            buff = ''
            continue

        buff += c

tests = '''(,,(,));
        (A,B,(C,D));
        (A,B,(C,D)E)F;
        (:0.1,:0.2,(:0.3,:0.4):0.5);
        (:0.1,:0.2,(:0.3,:0.4):0.5):0.0;
        (A:0.1,B:0.2,(C:0.3,D:0.4):0.5);
        (A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;
        ((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;'''.split ('\n')

for test in tests: print (parse (test) )
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top