Source code for tde.readers.gold_reader

#!/usr/bin/env python
"""Gold object contains a vad, a word alignment and a phone alignmenet

Each alignment can be represented either as an interval tree or a dictionary,
depending on the usage (an interval tree is fast for interval retrieval /
overlap queries).
"""

import os
import warnings
from collections import defaultdict

import intervaltree
import pandas as pd

class Gold():
    """Gold annotation of a corpus: word alignment and phone alignment.

    Both alignments are read from text files whose lines have the form
    ``filename onset offset symbol`` (fields separated by a single space)
    and are stored as one interval tree per file.

    Attributes
    ----------
    vad_path : str or None
        Path to the VAD file (stored, not read by this class).
    wrd_path : str
        Path to the word alignment.
    phn_path : str
        Path to the phone alignment.
    words : dict
        ``{fname: intervaltree.IntervalTree}`` of the gold words.
    phones : dict
        ``{fname: intervaltree.IntervalTree}`` of the gold phones.
    boundaries : tuple of two dicts
        ``(boundaries_up, boundaries_down)`` computed from the word
        alignment; ``boundaries_up[fname]`` is the set of word offsets and
        ``boundaries_down[fname]`` the set of word onsets.
    ix2wrd, wrd2ix : dict
        Index -> word and word -> index mappings.
    ix2phn, phn2ix : dict
        Index -> phone and phone -> index mappings.
    """

    def __init__(self, vad_path=None, wrd_path=None, phn_path=None):
        """Store the paths and read the word and phone alignments.

        Parameters
        ----------
        vad_path : str, optional
            Path to the VAD.
        wrd_path : str
            Path to the word alignment.
        phn_path : str
            Path to the phone alignment.

        Raises
        ------
        ValueError
            If an alignment file does not exist or is badly formatted.
        """
        # paths
        self.vad_path = vad_path
        self.wrd_path = wrd_path
        self.phn_path = phn_path

        # golds
        self.boundaries = None
        self.phones = None
        self.words = None

        # read alignments
        self.words, _, self.ix2wrd, self.wrd2ix, self.boundaries = (
            self.read_gold_intervalTree(self.wrd_path, "word"))
        # NOTE: the word reader already skips "SIL" entries, so this check
        # only triggers if that filtering is ever changed.
        if "SIL" in self.wrd2ix:
            print("WARNING: Word alignement contains silences, those will be counted as word by the evaluation.\n"
                  "You should keep them in the phone alignment but remove them from the word alignment.")
        self.phones, _, self.ix2phn, self.phn2ix, _ = (
            self.read_gold_intervalTree(self.phn_path, "phone"))

    def read_gold_dict(self, gold_path):
        """Read a gold alignment file into per-file lists.

        The file must contain the space-separated fields
        ``file start end symbol``.

        Parameters
        ----------
        gold_path : str
            Path to the alignment file.

        Returns
        -------
        gold : dict
            ``gold[fname] = {'start': [...], 'end': [...], 'symbol': [...]}``
            where the lists are ordered by onset, timestamps are rounded to
            4 decimals and symbols are replaced by their integer index.
        ix2symbols : dict
            Index -> symbol mapping.
        symbol2ix : dict
            Symbol -> index mapping.

        Raises
        ------
        ValueError
            If ``gold_path`` is not an existing file.
        """
        if not os.path.isfile(gold_path):
            raise ValueError('{}: File Not Found'.format(gold_path))

        # Read the alignment using pandas
        df = pd.read_table(
            gold_path, sep=' ', header=None, encoding='utf8',
            names=['file', 'start', 'end', 'symbol'])

        # sort the data by file and onsets and round the onsets/offsets
        df = df.sort_values(by=['file', 'start'])
        df['start'] = df['start'].round(decimals=4)
        df['end'] = df['end'].round(decimals=4)

        # Build the lexicon and encode the symbols as integers. Sorting
        # (by string form, so mixed types cannot break the comparison) makes
        # the index assignment reproducible from one run to the next.
        symbols = sorted(set(df['symbol']), key=str)
        symbol2ix = {v: k for k, v in enumerate(symbols)}
        ix2symbols = dict((v, k) for k, v in symbol2ix.items())
        df['symbol'] = df['symbol'].map(symbol2ix)

        # timestamps in gold (start, end) must be in ascending order for
        # fast search; the sort above guarantees it for the onsets.
        gold = {}
        for k in df['file'].unique():
            # select the rows of this file once instead of once per column
            rows = df[df['file'] == k]
            gold[k] = {
                'start': list(rows['start'].values),
                'end': list(rows['end'].values),
                'symbol': list(rows['symbol'].values)}

        return gold, ix2symbols, symbol2ix

    def read_gold_intervalTree(self, gold_path, symbol_type=None):
        """Read a gold alignment and build one interval tree per file.

        Parameters
        ----------
        gold_path : str
            Path to the gold alignment; each line must be
            ``filename onset offset symbol`` separated by single spaces.
        symbol_type : str, optional
            ``"word"`` or ``"phone"``. With ``"word"``, ``SIL`` entries are
            dropped. With ``"phone"``, they are kept and a warning is
            emitted if none is found.

        Returns
        -------
        gold : dict
            ``{fname: intervaltree.IntervalTree}`` of the gold intervals.
        transcription : dict
            ``{(fname, onset, offset): symbol}``.
        ix2symbols : dict
            Index -> symbol mapping (indices follow the sorted symbols).
        symbol2ix : dict
            Symbol -> index mapping.
        boundaries : tuple of two dicts
            ``(boundaries_up, boundaries_down)``: for each file, the set of
            offsets and the set of onsets respectively.

        Raises
        ------
        ValueError
            If the file does not exist or a line is not well formatted.
        AssertionError
            If an interval has an offset that is not greater than its onset.

        Warns
        -----
        UserWarning
            If ``symbol_type`` is ``"phone"`` and the alignment contains no
            ``SIL`` entry.
        """
        if not os.path.isfile(gold_path):
            raise ValueError('{}: File Not Found'.format(gold_path))

        # Collect a list of (onset, offset, symbol) tuples for each
        # filename, then create an interval tree from each list.
        intervals = defaultdict(list)
        gold = dict()
        symbols = set()  # all the available symbols
        transcription = dict()  # transcription of each interval
        boundaries_up = defaultdict(set)
        boundaries_down = defaultdict(set)

        # Set to True as soon as a silence is seen in a phone alignment.
        sil_flag = False

        with open(gold_path, 'r') as fin:
            for line in fin:
                try:
                    fname, on, off, symbol = line.strip('\n').split(' ')
                    onset, offset = float(on), float(off)
                except ValueError:
                    raise ValueError(
                        'format of alignement should be:\n'
                        '\tfilename onset offset symbol\n'
                        'but alignment contains wrongly formated line:\n'
                        '{}'.format(line))

                # Check timestamps are in correct order. Raised explicitly
                # (rather than with `assert`) so the check survives `-O`.
                if not offset > onset:
                    raise AssertionError("timestamps are not"
                                         " correct\n {}".format(line))

                # If word alignment, don't keep silences, else, keep them.
                if symbol == "SIL":
                    if symbol_type == "word":
                        continue
                    if symbol_type == "phone":
                        sil_flag = True

                transcription[(fname, onset, offset)] = symbol
                symbols.add(symbol)
                intervals[fname].append((onset, offset, symbol))
                boundaries_up[fname].add(offset)
                boundaries_down[fname].add(onset)

        # for each filename, create an interval tree
        for fname in intervals:
            gold[fname] = intervaltree.IntervalTree.from_tuples(
                intervals[fname])

        # warn if the phone alignment doesn't contain silences
        if symbol_type == "phone" and not sil_flag:
            warnings.warn("phone alignment does not contain"
                          " silences, which are necessary for correct"
                          " evaluation.", UserWarning, stacklevel=2)

        # Create the index <-> symbol mappings; sorting keeps the indices
        # identical from one run to the next.
        symbol2ix = {v: k for k, v in enumerate(sorted(symbols))}
        ix2symbols = dict((v, k) for k, v in symbol2ix.items())

        return (gold, transcription, ix2symbols, symbol2ix,
                (boundaries_up, boundaries_down))

    @staticmethod
    def get_intervals(fname, on, off, gold, transcription):
        """Retrieve the gold intervals covered by ``[on, off]``.

        The candidates come from ``gold[fname].overlap(on, off)``. A
        candidate is kept when the query covers at least 50% of it or at
        least 30 ms of it.

        Parameters
        ----------
        fname : str
            Name of the file/speaker.
        on : float
            Onset of the query interval.
        off : float
            Offset of the query interval.
        gold : dict of intervaltree
            ``{fname: interval tree}`` containing the gold intervals.
        transcription : dict
            ``{(fname, onset, offset): symbol}`` for each gold interval.

        Returns
        -------
        cov_int : set of tuple
            ``(onset, offset)`` of each kept interval.
        trs : list
            Symbols of the kept intervals, ordered by onset.
        """
        def overlap(a, b, interval):
            # duration shared by [a, b] and the interval, and the fraction
            # of the interval that this duration represents
            time = min(b, interval[1]) - max(a, interval[0])
            ov = time / (interval[1] - interval[0])
            return ov, time

        # search interval tree
        _cov_int = gold[fname].overlap(on, off)

        cov_int = set()  # set of kept intervals
        cov_trs = []  # retrieved transcription

        # Check each interval to see if we keep it or not: it must be
        # covered for at least 50% of its length or for at least 30 ms.
        for interval in _cov_int:
            int_ov, time = overlap(on, off, interval)
            if round(int_ov, 4) >= 0.50 or round(time, 4) >= 0.03:
                cov_trs.append(
                    (interval[0], interval[1],
                     transcription[(fname, interval[0], interval[1])]))
                cov_int.add((interval[0], interval[1]))

        # Sort the transcription by onsets, because the intervals are not
        # necessarily returned in order.
        cov_trs.sort()
        trs = [t for b, e, t in cov_trs]
        return cov_int, trs

    def get_silence_intervals(self, vad):
        """Compute the interval tree of silences (not implemented)."""
        pass