path: root/textproc/py-wordnet/files/concordance.py
# Access the semantic concordance data for WordNet 1.6
# by Des Berry, berry@ais.it

import string
from wordnet import binarySearchFile

# Sample entries in the 'taglist' file
#   ordinary%1:18:01:: 1 br-a01:78,1;86,1;88,4
#   ordered%5:00:00:organized:01 2 br-j23:6,14;13,32;66,12
# where the general form is:
#   lemma%ss_type:lex_filenum:lex_id:head_word:head_id sense_number [location_list]
#   location_list: filename:sent_num,word_num[;sent_num,word_num...]
#   (an illustrative field-splitting sketch follows below)

ss_type = ("NOUN", "VERB", "ADJECTIVE", "ADVERB", "ADJECTIVE SATELLITE")

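# Illustrative sketch only (not used by the functions below): how one taglist
# entry splits into the fields documented above.  The helper name
# parse_taglist_entry is ours, not part of the original concordance code.
def parse_taglist_entry(line):
  # lemma%ss_type:lex_filenum:lex_id:head_word:head_id sense_number location_list
  sense_key, sense_number, location_list = string.split(line, None, 2)
  lemma, rest = string.split(sense_key, '%', 1)
  stype, lex_filenum, lex_id, head_word, head_id = string.split(rest, ':')
  return (lemma, stype, lex_filenum, lex_id, head_word, head_id,
          sense_number, location_list)
# e.g. parse_taglist_entry("ordinary%1:18:01:: 1 br-a01:78,1;86,1;88,4")
# -> ('ordinary', '1', '18', '01', '', '', '1', 'br-a01:78,1;86,1;88,4')
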
# given a sentence number (and the contents of a semantic concordance file)
# return a string of words as the sentence
def find_sentence(snum, msg):
  pattern = "<s snum=%s>" % snum # opening tag of the requested sentence
  s = string.find(msg, pattern)
  if s < 0:
    return "<Unknown>"
  s = s + len(pattern)
  sentence = ""
  tag = ""
  while 1:
    if msg[s] == '\n':
      s = s + 1
    n = string.find(msg, '<', s)
    if n < 0:
      break
    if n - s != 0:
      if tag == "w" and msg[s] != "'" and len(sentence) > 0: # word form
        sentence = sentence + " "
      sentence = sentence + msg[s:n]
    e = string.find(msg, '>', n)
    if e < 0:
      break
    tag = msg[n+1]
    if tag == "/": #check for ending sentence
      if msg[n+2] == 's':
        #end of sentence
        break
    s = e + 1
  return sentence

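# A minimal sketch of what find_sentence consumes and produces (not called
# anywhere).  The markup below is a hand-made stand-in for a concordance
# fragment, not a verbatim excerpt from the semcor files.
def _demo_find_sentence():
  msg = ('<s snum=1>\n'
         '<wf cmd=ignore pos=DT>The</wf>\n'
         '<wf cmd=done pos=NN>cat</wf>\n'
         '<punc>.</punc>\n'
         '</s>')
  return find_sentence(1, msg) # -> "The cat."
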
# given a taglist sense (one line of the tagfile) and where to find the tagfile (root)
# return a tuple of
#  synset type ('1' .. '5')
#  sense (numeric character string)
#  list of sentences (constructed from the taglist)
def tagsentence(tag, root):
  s = string.find(tag, '%')
  sentence = []
  type = tag[s+1]
  c = s
  for i in range(0,4):
    c = string.find(tag, ':', c + 1)
  c = string.find(tag, ' ', c + 1)
  sense = tag[c+1] # sense number (only a single digit is taken)
  c = c + 3        # skip the sense digit and the following space
  while 1:
    d = string.find(tag, ' ', c) # file separator
    if d < 0:
      loclist = tag[c:]
    else:
      loclist = tag[c:d]
      c = d + 1

    e = string.find(loclist, ':')
    filename = loclist[:e]
    fh = open(root + filename, "rb")
    msg = fh.read()
    fh.close()

    while 1:
      e = e + 1
      f = string.find(loclist, ';', e)
      if f < 0:
        sent_word = loclist[e:]
      else:
        sent_word = loclist[e:f]
        e = f

      g = string.find(sent_word, ',')
      sent = sent_word[:g]

      sentence.append(find_sentence(sent, msg))

      if f < 0:
        break

    if d < 0:
      break
  return (type, sense, sentence)
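
# For the first sample taglist entry above, tagsentence returns
# ('1', '1', [...]): synset type '1' (NOUN), sense '1', and the text of
# sentences 78, 86 and 88 of concordance file br-a01.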

# given a word to search for and where to find the files (root)
# displays the information
# This could be changed to display in different ways!
def sentences(word, root):
  cache = {}
  file = open(root + "taglist", "rb")
  key = word + "%"
  keylen = len(key)
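  # binarySearchFile (from the wordnet module) is used to seek the sorted
  # taglist near the first entry for this lemma; candidate lines are then
  # read and prefix-checked below.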
  binarySearchFile(file, key + " ", cache, 10)
  print("Word '%s'" % word)
  while 1:
    line = file.readline()
    if line[:keylen] != key:
      break
    type, sense, sentence = tagsentence(line, root + "tagfiles/")
    print(ss_type[string.atoi(type) - 1], sense)
    for sent in sentence:
      print(sent)


def _test(word, corpus, base):
  print(corpus)
  sentences("ordinary", base + corpus + "/")

if __name__ == '__main__':
  base = "C:/win16/dict/semcor/"
  word = "ordinary"
  _test(word, "brown1", base)
  _test(word, "brown2", base)
  _test(word, "brownv", base)