Skip to content

fix ConTextMarkup compiled regex cache issue. #14

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

.idea
# C extensions
*.so

Expand Down
30 changes: 14 additions & 16 deletions pyConTextNLP/ConTextMarkup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
REG_CLEAN2 = re.compile(r"""\s+""", re.UNICODE)
REG_CLEAN3 = re.compile(r"""\d""", re.UNICODE)

COMPILED_REGEXPRS = {}


NODE_XML_SKEL = \
"""
Expand Down Expand Up @@ -305,22 +305,20 @@ def markItem(self, item, ConTextMode="target", ignoreCase=True):

# See if we have already created a regular expression

if not item.getLiteral() in COMPILED_REGEXPRS:
if not item.getRE():
reg_exp = r"\b{}\b".format(item.getLiteral())
if self.getVerbose():
print("generating regular expression", reg_exp)
else:
reg_exp = item.getRE()
if self.getVerbose():
print("using provided regular expression", reg_exp)
if ignoreCase:
regex = re.compile(reg_exp, re.IGNORECASE|re.UNICODE)
else:
regex = re.compile(reg_exp, re.UNICODE)
COMPILED_REGEXPRS[item.getLiteral()] = regex

if not item.getRE():
reg_exp = r"\b{}\b".format(item.getLiteral())
if self.getVerbose():
print("generating regular expression", reg_exp)
else:
reg_exp = item.getRE()
if self.getVerbose():
print("using provided regular expression", reg_exp)
if ignoreCase:
regex = re.compile(reg_exp, re.IGNORECASE|re.UNICODE)
else:
regex = COMPILED_REGEXPRS[item.getLiteral()]
regex = re.compile(reg_exp, re.UNICODE)

_iter = regex.finditer(self.getText())
terms = []
for i in _iter:
Expand Down
159 changes: 130 additions & 29 deletions pyConTextNLP/itemData.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,70 @@
#Copyright 2010 Brian E. Chapman
# Copyright 2010 Brian E. Chapman
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
A module defining the contextItem class.
"""
import yaml
import urllib.request, urllib.error, urllib.parse
import os

import yaml
import urllib
import csv
from io import StringIO

def _get_fileobj(_file):
    """Open a tab-delimited rule file and return ``(reader, handle)``.

    Parameters:
        _file: a URL, or a local path. A value without a URL scheme is
            treated as a local path and prefixed with ``file://``.

    Returns:
        A 2-tuple ``(reader, f0)`` where ``reader`` is a ``csv.reader`` over
        the decoded, tab-delimited content and ``f0`` is the object returned
        by ``urlopen``; the caller is responsible for closing ``f0``.
    """
    # Import the submodules explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` / ``urllib.request`` are loaded.
    import urllib.parse
    import urllib.request

    # Python 3 has no ``urllib.urlopen``; use ``urllib.request.urlopen`` and
    # make sure the target is defined whether or not a scheme is present.
    if urllib.parse.urlparse(_file).scheme:
        target = _file
    else:
        target = "file://" + _file
    f0 = urllib.request.urlopen(target, data=None)
    # newline=None enables universal-newline translation on the decoded text.
    text = f0.read().decode()
    return csv.reader(StringIO(text, newline=None), delimiter="\t"), f0


class contextItem(object):
def get_items(file_str):
    """Build a list of contextItem objects from a path, URL or raw content.

    Parameters:
        file_str: one of
            * a path or http(s) URL ending in ``.csv``, ``.tsv`` or ``.yml``;
            * the text of a YAML rule file (recognised by ``"Comments:"``);
            * the text of a comma- or tab-delimited rule file.

    Returns:
        The list produced by the matching ``instantiateFrom...`` loader.
        An empty list is returned when a local file does not exist or when
        the input format is not recognised (a message is printed in the
        latter case).
    """
    lowered = file_str.lower()
    if lowered.endswith((".csv", ".tsv", ".yml")):
        # Only real http(s) URLs bypass the filesystem checks; a substring
        # test would misclassify local names that merely contain "http".
        if not lowered.startswith(("http://", "https://")):
            # The loaders prepend "file://", which needs an absolute path.
            file_str = os.path.abspath(file_str)
            if not os.path.exists(file_str):
                # contextItem() cannot be built without arguments, so an
                # empty list is the "no items" result.
                return []
        if lowered.endswith(".yml"):
            return instantiateFromYMLtoitemData(file_str)
        return instantiateFromCSVtoitemData(file_str)
    if "Comments:" in file_str:
        return instantiateFromYMLStr(file_str)
    if "," in file_str:
        return instantiateFromCSVStr(file_str, ",")
    if "\t" in file_str:
        return instantiateFromCSVStr(file_str, "\t")
    print(
        "This input format is not supported. It can be either a path of csv, tsv or yaml file, or a string of corresponding file content.")
    # Return an empty list rather than None so callers can always iterate.
    return []


class contextItem(object):

def __init__(self, args):
self.__literal = args[0]
cs = args[1].split(",")
self.__category = []
for c in cs:
self.__category.append(c.lower().strip())
self.__re = r"%s"%args[2] # I need to figure out how to read this raw string in properly
self.__re = r"%s" % args[2] # I need to figure out how to read this raw string in properly
self.__rule = args[3].lower()

# generate regex from literal if no regex provided
Expand All @@ -53,32 +74,112 @@ def __init__(self, args):
def getLiteral(self):
"""return the literal associated with this item"""
return self.__literal

def getCategory(self):
"""return the list of categories associated with this item"""
return self.__category[:]

def categoryString(self):
"""return the categories as a string delimited by '_'"""
return '_'.join(self.__category)


def isA(self,testCategory):
def isA(self, testCategory):
"""test whether testCategory is one of the categories associated with self"""
try:
return testCategory.lower().strip() in self.__category
except:
for tc in testCategory:
if( tc.lower().strip() in self.__category ):
if (tc.lower().strip() in self.__category):
return True
return False

def getRE(self):
return self.__re

def getRule(self):
return self.__rule

def __str__(self):
txt = """literal<<{0}>>; category<<{1}>>; re<<{2}>>; rule<<{3}>>""".format(
self.__literal,self.__category,self.__re, self.__rule)
self.__literal, self.__category, self.__re, self.__rule)
return txt

def __repr__(self):
return self.__str__()


def instantiateFromCSVStr(content, splitter):
    """Parse delimited rule text into a list of contextItem objects.

    Parameters:
        content: the full text of a delimited rule file.
        splitter: the single-character field delimiter (e.g. ``','``).

    Returns:
        A list with one contextItem per usable row. Rows rejected by
        ``read_row`` (too short, or starting with ``#``) are skipped.
        No header row is skipped here.
    """
    reader = csv.reader(content.splitlines(), delimiter=splitter)
    # Collect into a plain list: contextItem() needs constructor arguments
    # and has no append(), so it cannot serve as the container.
    items = []
    for row in reader:
        fields = read_row(row)
        if fields is not None:
            items.append(contextItem(fields))
    return items


def instantiateFromYMLStr(content):
    """Turn the text of a multi-document YAML rule file into contextItems.

    Each YAML document must provide the keys ``Lex``, ``Type``, ``Regex``
    and ``Direction``; one contextItem is created per document, in order.
    """
    context_items = []
    for doc in yaml.safe_load_all(content):
        fields = (doc["Lex"], doc["Type"], r"%s" % doc["Regex"], doc["Direction"])
        context_items.append(contextItem(fields))
    return context_items


def instantiateFromYMLtoitemData(_file):
    """Load contextItem objects from a YAML rule file or URL.

    Parameters:
        _file: a URL, or a local path. A value without a URL scheme is
            treated as a local path and prefixed with ``file://``.

    Returns:
        A list with one contextItem per YAML document; each document must
        provide the keys ``Lex``, ``Type``, ``Regex`` and ``Direction``.
    """
    # Import the submodules explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` / ``urllib.request`` are loaded.
    import urllib.parse
    import urllib.request

    if urllib.parse.urlparse(_file).scheme:
        target = _file
    else:
        target = "file://" + _file
    f0 = urllib.request.urlopen(target, data=None)
    try:
        # safe_load_all is lazy, so the documents must be consumed while the
        # handle is still open.
        return [contextItem((d["Lex"],
                             d["Type"],
                             r"%s" % d["Regex"],
                             d["Direction"])) for d in yaml.safe_load_all(f0)]
    finally:
        # Always release the handle, even if a document is malformed.
        f0.close()


def instantiateFromCSVtoitemData(csvFile, encoding='utf-8', headerRows=1, literalColumn=0, categoryColumn=1,
                                 regexColumn=2, ruleColumn=3):
    """Load contextItem objects from a tab-delimited rule file or URL.

    Parameters:
        csvFile: path or URL handed to ``_get_fileobj``.
        headerRows: number of leading rows to discard as headers.
        encoding, literalColumn, categoryColumn, regexColumn, ruleColumn:
            accepted for backward compatibility but not consulted here;
            column positions are fixed by ``read_row``.

    Returns:
        A list with one contextItem per usable row. Rows rejected by
        ``read_row`` (too short, or starting with ``#``) are skipped.
    """
    reader, f0 = _get_fileobj(csvFile)
    try:
        # First discard the specified number of header rows; stop quietly if
        # the file is shorter than that instead of raising StopIteration.
        for _ in range(headerRows):
            if next(reader, None) is None:
                break
        # Collect into a plain list: contextItem() needs constructor
        # arguments and has no append(), so it cannot be the container.
        items = []
        for row in reader:
            fields = read_row(row)
            if fields is not None:
                items.append(contextItem(fields))
        return items
    finally:
        # Release the handle even when a row fails to parse.
        f0.close()


def read_row(row):
    """Normalise one parsed row to ``[literal, category, regex, rule]``.

    Returns None for rows with fewer than two cells or whose first cell
    starts with ``#``. Missing regex/rule cells are filled with empty
    strings; cells beyond the fourth are ignored.
    """
    n_cells = len(row)
    if n_cells < 2 or row[0].startswith('#'):
        return None
    literal, category = row[0], row[1]
    if n_cells == 2:
        regex, rule = '', ''
    elif n_cells == 3:
        regex, rule = r"{0}".format(row[2]), ''
    else:
        regex, rule = row[2], row[3]
    return [literal, category, regex, rule]
29 changes: 29 additions & 0 deletions pyConTextNLP/tests/test_yml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import os
import unittest

import pyConTextNLP.itemData as itemData
import pyConTextNLP.pyConText as pyConText


class SimpleTestCase(unittest.TestCase):
    """Smoke test: YAML rule files load and can be used to mark up text."""

    def test_1(self):
        """An 'r/o' node is produced when marking a sentence containing it."""
        sent1 = 'IMPRESSION: 1. R/O STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT PULMONARY EMBOLISM.'
        # Resolve the KB directory relative to this file so the test does not
        # depend on the directory it is launched from.
        kb_dir = os.path.normpath(os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "..", "..", "KB"))
        modifiers = itemData.get_items(os.path.join(kb_dir, "pneumonia_modifiers.yml"))
        targets = itemData.get_items(os.path.join(kb_dir, "pneumonia_targets.yml"))
        markup = pyConText.ConTextMarkup()
        markup.setRawText(sent1.lower())

        markup.markItems(modifiers, mode="modifier")
        markup.markItems(targets, mode="target")
        found = any('r/o' in str(node) for node in markup.nodes(data=True))
        # Use a unittest assertion: a bare assert is stripped under -O.
        self.assertTrue(found, "expected a node containing 'r/o' in the markup")


if __name__ == '__main__':
unittest.main()