T.R | Title | User | Personal Name | Date | Lines |
---|
448.1 | source for word program | AVANTI::OSMAN | | Mon Mar 03 1986 18:45 | 223 |
| /*********************************************************************************************************************************/
/* Created 31-JAN-1986 17:03:19 by VAX-11 SDL V2.1 Source: 31-JAN-1986 17:02:06 USER$7:[OSMAN.WORDS]WORD_STRUCTURE.SDL;5 */
/*********************************************************************************************************************************/
/*** MODULE words ***/
struct words {
long int len; /* length of word */
char (*adr)[]; /* address of word */
} ;
#define max_n_words 100000
int dump_flag = 0;
int show_ignore_flag = 0;
int n_words = 0;
int q_mask = (1 << ('q' - 'a'));
int u_mask = (1 << ('u' - 'a'));
int vowel_mask =
(1 << ('a' - 'a')) +
(1 << ('e' - 'a')) +
(1 << ('i' - 'a')) +
(1 << ('o' - 'a')) +
(1 << ('u' - 'a')) +
(1 << ('y' - 'a'));
int max_lets_used = 0;
struct words words[max_n_words];
int word_index = 0;
int bits_lets_used = 0;
unsigned char (* p_lets_used)[4] = &bits_lets_used;
int bits_lets_loaded = 0;
int n_words_used = 0;
int words_used[26];
int i, j;
int n_lets_loaded = 0;
int btab[26] = {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1};
char bits[256];
main ()
{
for (i=0;i<=255;i++)
{
bits[i] = 0;
for (j=0;j<=7;j++) bits[i] += (i >> j) & 1;
}
{
int fd;
#define mrc 1000
char record[mrc];
int nbytes, n_discarded = 0;
if ((fd = open ("words:sorted.txt", 0)) == -1)
{printf ("Failed to open file\n");exit (0);};
n_words = 0;
while (1)
{
nbytes = read (fd, record, mrc);
if (n_words >= max_n_words)
{printf ("Only using first %d words\n", max_n_words);
close (fd);break;}
if (nbytes <= 0) {close (fd);break;}
nbytes--;
for (i = 0; i < nbytes; i++)if((record[i] >= 'a' && record[i] <= 'z')
|| record[i] == ' ')
continue; else
{
if (show_ignore_flag)
printf ("Ignoring \"%.*s\"\n", nbytes, record);
goto next;
}
for (i=0;i<nbytes;i++)
{
if (record[i] == ' ')
{
nbytes =i;
goto found_space;
}
}
found_space:
if (nbytes == 1) if (record[0] != 'a' && record[0] != 'i')
{
n_discarded++;
goto next;
}
words[n_words].len = 0;
for (i = 0; i < nbytes; i++)
{
int let_mask = 1 << (record[i] - 'a');
if ((words[n_words].len & let_mask) != 0)
{
n_discarded++;
goto next;
}
words[n_words].len |= let_mask;
}
if ((words[n_words].len & vowel_mask) == 0)
{
n_discarded++;
goto next;
}
if ((words[n_words].len & u_mask) == 0 &&
(words[n_words].len & q_mask) != 0)
{
n_discarded++;
goto next;
}
if (n_words > 0)
if (words[n_words].len == words[n_words - 1].len)
{
n_discarded++;
goto next;
}
if ((words[n_words].adr = calloc (1, nbytes + 1)) == 0)
{printf ("Failed to allocate word\n"); exit (0);};
strncpy (words[n_words].adr, record, nbytes);
if (dump_flag) printf (" %s\n", words[n_words].adr);
if (n_words > 0)
{
if ((*words[n_words].adr)[0] != (*words[n_words-1].adr)[0])
{
btab[(*words[n_words].adr)[0] - 'a'] = n_words;
bits_lets_loaded |= 1 << ((*words[n_words].adr)[0] - 'a');
/* printf ("Loaded the %.1s's.\n", words[n_words-1].adr); */
}
} else
{
btab[(*words[n_words].adr)[0] - 'a'] = n_words;
bits_lets_loaded |= 1 << ((*words[n_words].adr)[0] - 'a');
}
n_words++;
next:;
}
/* printf ("Loaded the %.1s's.\n", words[n_words-1].adr); */
printf ("Kept %d words, discarded %d words.\n", n_words, n_discarded);
}
try_next_word:
if ((bits_lets_used & words[word_index].len) != 0) goto dont_use_word;
bits_lets_used |= words[word_index].len;
good_word:
words_used[n_words_used++] = word_index;
if ((bits_lets_used & vowel_mask) != vowel_mask) goto add_a_word;
{
char n_lets_used =
bits[(*p_lets_used)[0]] + bits[(*p_lets_used)[1]] +
bits[(*p_lets_used)[2]] + bits[(*p_lets_used)[3]];
if (n_lets_used < max_lets_used) goto backup;
for (i = 0; i < n_words_used; i++)
printf (" %s", words[words_used[i]].adr);
printf ("\n");
max_lets_used = n_lets_used;
goto backup;
}
un_word:
bits_lets_used ^= words[word_index].len;
dont_use_word:
add_a_word:
if (++word_index >= n_words) goto backup;
{
int let_slot = (*words[word_index].adr)[0] - 'a';
if ((bits_lets_used & (1 << let_slot)) != 0)
{
for (i = 1 + let_slot; i <= 25; i++)
if (btab[i] >= 0 && (bits_lets_used & (1 << i)) == 0)
{word_index = btab[i]; goto got_slot;}
goto backup;
}
got_slot:;
}
goto try_next_word;
backup:
if (n_words_used < 1) {printf ("Done.\n"); exit (1);};
n_words_used--;
word_index = words_used[n_words_used];
goto un_word;
}
|
448.2 | D.V.Pike threw J.Q.Schwartz my box. | LATOUR::AMARTIN | Alan H. Martin | Mon Mar 03 1986 19:48 | 17 |
| Re .0:
The written description of your program doesn't seem to contain any
hint of discarding sequences of words that are not legal sentences.
This leads me to believe that the program will produce output which
is claimed to be a sentence, but which isn't.
If you had a sample grammar for sentences, you might be able to
dramatically increase the program's efficiency. (Or decrease it).
Also, since my Ripley's Big Book from the '39 World's Fair has two separate
English sentences which use all 26 letters once, there is no correct
answer to the stated problem, which assumes there is only one such sentence.
You might as well quit while you're ahead.
No, I don't have the book with me, but I remember one of the two.
/AHM
|
448.3 | Not intended to be a sentence | LYRA::THALLER | Kurt (Tex) Thaller | Mon Mar 03 1986 22:35 | 12 |
| re. .2,
The original note states that the search is not trying to look for
a sentence, but merely a set of words spanning the 26 letters.
This sounds intriguing. If I get a chance I'll look into improving
the algorithm a bit.
p.s. Where did you get the 80000 word dictionary? Can I have access
to it?
Kurt*
|
448.4 | | CLT::GILBERT | Juggler of Noterdom | Tue Mar 04 1986 03:09 | 36 |
| This is the 'cover' problem, which I suspect is NP-complete.
This doesn't mean that the particular problem is insurmountable.
Consider the 2000 words as an an array of bit-vectors:
adze 10011000000000000000000001
pencil 00101000100101010000000000
...
quack 10100000001000001000100000
Now, permute the columns judiciously, and sort the numbers in descending
order. The 'judicious permutation' might place 'q' in the most significant
place, followed by 'u', then some other strange consonants (perhaps some
more vowels should be next?). The idea is that the recursive program that
searches for a solution would sequence through the list looking for a word
that could be 'played', and the earlier words would tend to disallow many
of the following words. Thus, the 'depth' would not become too great, and
the program would proceed faster and faster. The idea behing putting the
'q's first is that most of these words consume two vowels.
As an alternative, consider this. There are five vowels, and 21 consonants.
Now any complete solution must have a 'vowel ratio' of 21/5, so some word
in the solution must have 'vowel ratio' >= 21/5. Sort the list of possible
words into descending order by their 'vowel ratio'. The recursive step
involves determining the 'vowel ratio' that must be given by the remaining
words, and trying each of the words, starting at the place in the list
that has that 'vowel ratio'.
And a third approach. There are 2^26 combinations of 26 bits.
Representing each of these as a bit will take 2^(26-3-9) pages of memory,
or about 32K pages (assuming that the 'q' word in any solution will contain
a 'u' will cut this in half, making it workable; a similar technique could
be used in any of the other approaches, reducing the word list). Now,
determine what sets of letters acn be 'gotten', by adding words one at a
time [as a slight aid, you should permute the least-frequently used letters
to the low-order bits of the bit index].
|
448.5 | Stop Working on the Problem When You Have the Answer | BEING::POSTPISCHIL | Always mount a scratch monkey. | Tue Mar 04 1986 09:11 | 6 |
| Re .2:
Well, don't just sit there, type in the sentence you remember! :-)
-- edp
|
448.6 | Seven Vowels | BEING::POSTPISCHIL | Always mount a scratch monkey. | Tue Mar 04 1986 09:51 | 6 |
| Re .4:
Don't forget that w and y can be vowels (as in "cwm").
-- edp
|
448.7 | no, you can't start at that place | AVANTI::OSMAN | | Tue Mar 04 1986 10:25 | 13 |
| > in the solution must have 'vowel ratio' >= 21/5. Sort the list of possible
>words into descending order by their 'vowel ratio'. The recursive step
>involves determining the 'vowel ratio' that must be given by the remaining
>words, and trying each of the words, starting at the place in the list
>that has that 'vowel ratio'.
This doesn't quite sound right. If we start at the place that has
a vowel ratio equal to the demanded ratio for what's remaining,
we're skipping possible combinations of, let's say one word with
a LARGE ratio averaged with one with a SMALL ratio. (We'll skip
the large one, which might be part of a solution).
/Eric
|
448.8 | | CLT::GILBERT | Juggler of Noterdom | Tue Mar 04 1986 12:45 | 8 |
| Oops, right. Instead, realizing that you must include some word with
a high vowel ratio, you can start at the top of the list, and give up
when you reach a word that doesn't have the needed vowel ratio.
The fact that some words don't contain 'a','e','i','o' or 'u' complicates
matters only slightly. Split the problem into two search spaces: assuming
no such words in the solution, or assuming there is such a word in the
solution.
|
448.9 | Oops | GALLO::AMARTIN | Alan H. Martin | Tue Mar 04 1986 19:39 | 5 |
| Re .5:
I just noticed .2 had two "E"'s. Maybe the original used some archaic
spelling of "threw".
/AHM/SIGH
|
448.10 | Maybe twice is easier | METOO::YARBROUGH | | Wed Mar 05 1986 08:31 | 5 |
| Maybe it would be easier (and perhaps more fun) to tackle this variant
of the problem: Produce an English sentence in which every letter
appears exactly TWICE.
No, I don't have a solution.
|
448.11 | y | AVANTI::OSMAN | Eric, Maynard Ma. USA, DTN 223-6664 | Wed Mar 05 1986 11:07 | 6 |
| In answer to earlier request, yes, you're welcome to copy my words
files and play with them. They are
raynal::user$7:[osman.words]%z.txt.
/Eric
|
448.12 | Trinary, anyone? | LATOUR::AMARTIN | Alan H. Martin | Wed Mar 05 1986 11:10 | 6 |
| Re .10:
Unless have a trinary computer you want to share with us, your modification
of the problem presents an interesting challange - it basically rules
out bitvector-based solutions.
/AHM
|
448.13 | bitvectors still possible for double letters | AVANTI::OSMAN | Eric, Maynard Ma. USA, DTN 223-6664 | Thu Mar 06 1986 10:08 | 13 |
| No, you needn't rule out bitvector-based solutions for solving the
double-letter version.
One possible approach is to use a VAX quadword for representing
the bitvector for each word. You'd use 52 bits of the quadword
to map into a-z appearing once and a-z appearing again.
So, for instance, the word "apple" could be represented as
10001000000100010000000000 00000000000000010000000000
a e l p p
/Eric
|
448.14 | | CLT::GILBERT | Juggler of Noterdom | Thu Mar 06 1986 19:52 | 41 |
| I did some playing with the 'shortened' word-list I got from Eric
(this is over 15000 words). Each of these contains at least one of
the 6 vowels: aeiouy, and all the q-words contain at least 2 vowels.
The best 'ratio' between consonants and vowels is for schmaltz (7/1),
and there are 4 others with a ratio of 6/1. Note that the two vowels
used in a q-word don't get us many consonants (5 was the maximum, for
squinch or squelch).
Now there are 246 words containing a 'q'. This is smaller than the
number for 'j' or 'z'. Now a reasonable way to do a search might
be the following: Try each of the 'q' words, and see what words
remain; for example, we choose 'squelch' and find that 1143 words
are still 'playable', and that there are only 36 j-words (smallest).
So, we loop through the j-words, and so on.
Instead of looping through the j-words above, we could've checked
the consonant/vowel ratio, and used that to determine the next set
of words to loop over. For example, if we first choose 'quince',
we find we still need 17/3 for the remaining ratio. This means
that *some* other word in the solution must have a ratio of 6/1
or 12/2 or 17/3 (or better). However, we know there are only 5
words with such a high ratio. Since this is less that the number
of j-words or x-words, we loop over these. Doing so, they are
quickly exhausted, since they all contain 'sch', and 'quince' has
already used the 'c'.
So, after choosing a word, we disallow words containing those letters
(it's probably best to make a copy of those words that pass), check
that all the remaining letters have some representation on this
list, and see which letter has the fewest representatives. Of this
list, we also see which words have a sufficient consonant/vowel
ratio to help provide a solution. Whichever set is smaller, loop
over that set next.
One thing that should be done -- the list of words can be reduced
somewhat by not considering words that are easily formed from two
(or more) other words. For example, 'flanges' can be produced from
'fan' and 'legs', and so can be removed, thereby shortening the list.
- Gilbert
|
448.15 | tradeoffs ? | AVANTI::OSMAN | Eric, Maynard Ma. USA, DTN 223-6664 | Mon Mar 10 1986 10:42 | 10 |
| Yes, I thought of the idea of removing words that were conglomerates
of two or more smaller words. It didn't seem to me many would be
removed. What do you think ?
As for you methods of calculating what order to try things in,
it's not clear to me that the time you spend calculating what
order to try things in will be less than the time you'd spend
actually trying things in a less calculated order.
/Eric
|
448.16 | | CLT::GILBERT | Juggler of Noterdom | Tue Apr 08 1986 01:30 | 1 |
| Well, has anyone programmed this yet?
|
448.17 | D.V.Pike thru J.Q.Schwartz my box | DENTON::AMARTIN | Alan H. Martin | Mon May 04 1987 01:25 | 9 |
| Re .3:
Ah, rereading .0, I agree that we're not talking about generating sentences
here.
Re .9:
In retrospect, it isn't hard to guess what spelling of "threw" was used.
/AHM/THX
|