Skip to content

Commit acb7036

Browse files
committed
Merge branch 'develop'
2 parents 6ee736d + 08b88ab commit acb7036

40 files changed

+45613
-43971
lines changed

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,3 @@ Tested with Travis CI.
66
[![Build Status](https://travis-ci.org/HazyResearch/deepdive.svg?branch=master)](https://travis-ci.org/HazyResearch/deepdive)
77

88
### [Visit The DeepDive Website](http://deepdive.stanford.edu)
9-

ddlib/Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Regression check: run the same input through the extractor written without
# ddlib and the one written with ddlib, and fail if their outputs differ.
# Declared phony so that a stray file named "do" cannot suppress the check.
.PHONY: do
do:
	cat test.json | python ./without_ddlib.py > a
	cat test.json | python ./with_ddlib.py > b
	diff a b
	rm a
	rm b

ddlib/ddlib/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from dd import *

ddlib/ddlib/dd.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
import sys
2+
import collections
3+
4+
# One token of a sentence, as assembled by unpack_words().
Word = collections.namedtuple('Word', ['begin_char_offset', 'end_char_offset', 'word', 'lemma', 'pos', 'ner', 'dep_par', 'dep_label'])
# A run of `length` consecutive words starting at index `begin_word_id`.
Span = collections.namedtuple('Span', ['begin_word_id', 'length'])
# A list of elements plus a flag telling whether the two endpoints were given in reverse order.
Sequence = collections.namedtuple('Sequence', ['is_inversed', 'elements'])
# One edge of a dependency path; `is_bottom_up` is True for edges walked from child to parent.
DepEdge = collections.namedtuple('DepEdge', ['word1', 'word2', 'label', 'is_bottom_up'])


def unpack_words(input_dict, character_offset_begin=None, character_offset_end=None, lemma=None,
                 pos=None, ner=None, words=None, dep_graph=None,
                 dep_graph_parser=lambda x: x.split('\t')):
    """Build a list of Word objects from parallel per-word columns stored in a dict.

    Each keyword argument (except dep_graph_parser) is the KEY under which the
    corresponding column is stored in input_dict, not the column itself.
    Columns whose key is None are treated as empty; shorter or missing columns
    are padded with None so that every Word has all fields.

    Args:
        input_dict: Mapping from column key to a sequence of per-word values.
        character_offset_begin: Key of the column of begin character offsets.
        character_offset_end: Key of the column of end character offsets.
        lemma: Key of the column of lemmas.
        pos: Key of the column of part-of-speech tags.
        ner: Key of the column of named-entity tags.
        words: Key of the column of surface words.
        dep_graph: Key of an iterable of dependency edges.
        dep_graph_parser: Callable turning one edge into a
            (parent, label, child) triple; parent and child are converted
            with int() and used as indices into the word list. The default
            splits the edge on tab characters.

    Returns:
        A list of Word namedtuples, one per position of the longest column.
        Words that never appear as the child of an edge get dep_par == -1
        and dep_label == "ROOT".

    Raises:
        KeyError: If a supplied key is not present in input_dict.
    """
    # Function-scope import so the module keeps working on Python 2, where the
    # same helper is spelled izip_longest.
    try:
        from itertools import zip_longest
    except ImportError:
        from itertools import izip_longest as zip_longest

    def column(key):
        # A column whose key was not supplied is treated as empty.
        return input_dict[key] if key is not None else ()

    array_character_offset_begin = column(character_offset_begin)
    array_character_offset_end = column(character_offset_end)
    array_lemma = column(lemma)
    array_pos = column(pos)
    array_ner = column(ner)
    array_words = column(words)
    edges = column(dep_graph)

    # Map each child index to its parent index and edge label.
    dep_tree = {}
    for path in edges:
        parent, label, child = dep_graph_parser(path)
        parent, child = int(parent), int(child)
        dep_tree[child] = {"parent": parent, "label": label}
        if parent not in dep_tree:
            # Provisional root entry; overwritten if the parent later shows up as a child.
            dep_tree[parent] = {"parent": -1, "label": "ROOT"}

    root = {"parent": -1, "label": "ROOT"}
    rows = zip_longest(array_character_offset_begin, array_character_offset_end,
                       array_lemma, array_pos, array_ner, array_words)

    wordobjs = []
    # Unpack each row by name so that every value lands in the field it belongs to.
    for i, (begin, end, lemma_i, pos_i, ner_i, word_i) in enumerate(rows):
        dep = dep_tree.get(i, root)
        wordobjs.append(Word(begin_char_offset=begin, end_char_offset=end,
                             word=word_i, lemma=lemma_i, pos=pos_i, ner=ner_i,
                             dep_par=dep["parent"], dep_label=dep["label"]))
    return wordobjs
36+
37+
38+
def log(obj):
    """Print the string form of an object to STDERR, followed by a newline.

    Args:
        obj: The object that the user wants to log to STDERR.
    """
    # str(obj) instead of obj.__str__(): calling __str__ directly on a class
    # or type object raises TypeError because no instance is bound.
    sys.stderr.write(str(obj) + "\n")
45+
46+
def materialize_span(words, span, func=lambda x:x):
    """Given a sequence of objects and a span, return the subsequence that corresponds to the span.

    Args:
        words: A sequence of objects.
        span: A Span namedtuple (anything with begin_word_id and length attributes).
        func: Optional function that will be applied to each element in the result subsequence.

    Returns:
        A list holding func(element) for each element covered by the span.
    """
    begin = span.begin_word_id
    # A list comprehension yields a real list on Python 2 and 3 alike;
    # map() would hand back a lazy iterator on Python 3.
    return [func(word) for word in words[begin:begin + span.length]]
55+
56+
def _fe_seq_between_words(words, begin_idx, end_idx, func=lambda x:x):
    """Return the elements strictly between two word indices as a Sequence.

    Args:
        words: A sequence of objects.
        begin_idx: Index of the first endpoint (excluded from the result).
        end_idx: Index of the second endpoint (excluded from the result).
        func: Optional function applied to each element of the result.

    Returns:
        A Sequence whose elements is a list of func(element) for the elements
        between the two indices, in sentence order. is_inversed is False when
        begin_idx < end_idx and True otherwise.
    """
    if begin_idx < end_idx:
        low, high, inversed = begin_idx, end_idx, False
    else:
        low, high, inversed = end_idx, begin_idx, True
    # Build a real list so that elements can be indexed and measured on
    # Python 3 too, where map() returns a lazy iterator.
    return Sequence(elements=[func(word) for word in words[low + 1:high]], is_inversed=inversed)
61+
62+
63+
def tokens_between_spans(words, span1, span2, func=lambda x:x):
    """Return the elements of words that lie between two spans.

    Args:
        words: A sequence of objects.
        span1: A Span namedtuple
        span2: A Span namedtuple
        func: Optional function that will be applied to each element in the result subsequence.

    Returns:
        The Sequence namedtuple produced by _fe_seq_between_words for the gap
        between the two spans. Its "is_inversed" label is meant to be True
        when span1 is *AFTER* span2.
    """
    span1_starts_first = span1.begin_word_id < span2.begin_word_id
    if span1_starts_first:
        # Gap runs from the last word of span1 to the first word of span2.
        from_idx = span1.begin_word_id + span1.length - 1
        to_idx = span2.begin_word_id
    else:
        # Gap runs from the first word of span1 back to the last word of span2.
        from_idx = span1.begin_word_id
        to_idx = span2.begin_word_id + span2.length - 1
    return _fe_seq_between_words(words, from_idx, to_idx, func)
81+
82+
def _path_to_root(words, word_idx):
83+
rs = []
84+
c_word_idx = word_idx
85+
while True:
86+
rs.append(words[c_word_idx])
87+
if words[c_word_idx].dep_par == -1 or words[c_word_idx].dep_par == c_word_idx:
88+
break
89+
c_word_idx = words[c_word_idx].dep_par
90+
return rs
91+
92+
def dep_path_between_words(words, begin_idx, end_idx):
    """Given a sequence of Word objects and two indices, return the sequence of Edges
    corresponding to the dependency path between these two words.

    Args:
        words: A sequence of Word objects.
        begin_idx: Index of the word the path starts from.
        end_idx: Index of the word the path ends at.

    Returns:
        A list of DepEdge objects, each of which corresponds to one edge on the
        dependency path. Edges climbing from words[begin_idx] come first
        (is_bottom_up=True), followed by the edges descending to
        words[end_idx] (is_bottom_up=False).
    """
    path_to_root1 = _path_to_root(words, begin_idx)
    path_to_root2 = _path_to_root(words, end_idx)
    # Shared ancestors are matched by object identity, not by value: two
    # different tokens can be equal as namedtuples (same text, tags and
    # parent), and treating them as one node would cut the path short.
    common = set(map(id, path_to_root1)) & set(map(id, path_to_root2))
    # NOTE(review): when the two words share no ancestor, nothing is raised;
    # each root then contributes an edge to words[-1] because its dep_par is
    # -1. Confirm whether callers rely on this.
    path = []
    for word in path_to_root1:
        if id(word) in common:
            break
        path.append(DepEdge(word1=word, word2=words[word.dep_par], label=word.dep_label, is_bottom_up=True))
    path_right = []
    for word in path_to_root2:
        if id(word) in common:
            break
        path_right.append(DepEdge(word1=words[word.dep_par], word2=word, label=word.dep_label, is_bottom_up=False))
    # The downward half was collected from the target upward, so flip it.
    path.extend(reversed(path_right))
    return path
120+

0 commit comments

Comments
 (0)