代码之家  ›  专栏  ›  技术社区  ›  Margus

如何建立单词表

  •  0
  • Margus  · 技术社区  · 14 年前

    我现在想制作一份爱沙尼亚语单词表,大约包含 20 万个小写的不重复单词。单词表的输入可以使用爱沙尼亚语语料库(corpus of Estonian)。语料库文件采用文本编码倡议(Text Encoding Initiative,TEI)格式。我尝试用正则表达式(regex)来查找单词。

    这就是我目前的做法:它效率低下;MVC 结构混乱不堪;如果单词的 HashSet 无法放入内存,它就无法工作;它不知道输入文件的编码,因此某些字母可能会出问题;它不显示预计完成时间;有些控件仍使用默认名称,有些则没有;它不使用多任务处理(multitasking,我不确定是否应该使用);它使用了一些奇怪的修补手段,并大量锁定界面,以免界面“冻结”。至少它足够短,以至于你几乎不会注意到代码里没有注释。

    好处是,它几乎可以无差错地从 .tei、.txt、.csv、.sgml、.xhtml 或任何类似格式的输入中读取单词。

    现在你已经知道我想做什么、我是如何尝试的(以及存在哪些问题),我只想知道应该怎样做(尽量减少手工劳动)。

    图像示例:

    alt text

    代码示例& Gui :

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.Data.SqlClient;
    using System.IO;
    using System.Text.RegularExpressions;
    
    namespace Reader
    {
        public partial class Form1 : Form
        {
            /// <summary>
            /// Creates the form. All setup is delegated to InitializeComponent,
            /// which is defined in another part of this partial class.
            /// </summary>
            public Form1()
            {
                this.InitializeComponent();
            }
    
    
            /// <summary>
            /// Drag-enter handler for the file list: allows the drop when the dragged
            /// data is offered in the FileDrop format, and leaves the effect untouched otherwise.
            /// </summary>
            /// <param name="sender">Control that raised the event (unused).</param>
            /// <param name="e">Drag data and the effect to report back.</param>
            private void listView1_DragEnter(object sender, DragEventArgs e)
            {
                bool carriesFiles = e.Data.GetDataPresent(DataFormats.FileDrop, false);
                if (!carriesFiles)
                {
                    return;
                }

                e.Effect = DragDropEffects.All;
            }
    
            /// <summary>
            /// Drop handler for the file list: adds every dropped path that is an existing
            /// file to the list view, grouped by its directory, checked, and with its size
            /// in KB as the second column. The totals are refreshed once at the end.
            /// </summary>
            /// <param name="sender">Control that raised the event (unused).</param>
            /// <param name="e">Drop data; expected to carry a FileDrop string array.</param>
            private void listView1_DragDrop(object sender, DragEventArgs e)
            {
                string[] files = e.Data.GetData(DataFormats.FileDrop, false) as string[];
                if (files == null)
                {
                    // Nothing usable was dropped; do not lock the GUI for it.
                    return;
                }

                setguiLock(true);
                this.loading.Visible = true;
                // Suppress per-item total updates while items are added in bulk.
                ignorechecking = true;
                try
                {
                    // Seed the lookup with the groups already shown, so dropping more files
                    // from a directory that is already listed reuses its group instead of
                    // creating a second group with the same header.
                    Dictionary<string, ListViewGroup> listviewgroups = new Dictionary<string, ListViewGroup>();
                    foreach (ListViewGroup existing in listView1.Groups)
                    {
                        if (existing.Header != null && !listviewgroups.ContainsKey(existing.Header))
                        {
                            listviewgroups.Add(existing.Header, existing);
                        }
                    }

                    int filenamesi = 0;
                    foreach (string file in files)
                    {
                        progresslabel.Text = string.Format("Progress: \t[ {0} / {1} ]", filenamesi++, files.Length);
                        // Let the progress label repaint while this handler is still running.
                        Application.DoEvents();

                        if (!File.Exists(file))
                        {
                            // Directories and vanished paths are ignored.
                            continue;
                        }

                        FileInfo ff = new FileInfo(file);

                        ListViewGroup group;
                        if (!listviewgroups.TryGetValue(ff.DirectoryName, out group))
                        {
                            group = new ListViewGroup(ff.DirectoryName, HorizontalAlignment.Left);
                            listviewgroups.Add(ff.DirectoryName, group);
                            listView1.Groups.Add(group);
                        }

                        ListViewItem item = new ListViewItem(ff.Name);
                        group.Items.Add(item);
                        item.Checked = true;

                        // Divide as long first: casting Length to int before dividing
                        // overflows for files of 2 GiB or more.
                        item.SubItems.Add("" + (ff.Length / 1024) + " KB");

                        listView1.Items.Add(item);
                    }
                }
                finally
                {
                    // Always unlock, even if adding a file failed, so the form stays usable.
                    setguiLock(false);
                    ignorechecking = false;
                    this.loading.Visible = false;
                }

                updatechecked();
            }
    
            /// <summary>
            /// Item-checked handler for the file list: recomputes the displayed totals.
            /// </summary>
            /// <param name="sender">Control that raised the event (unused).</param>
            /// <param name="e">Details of the item whose check state changed (unused).</param>
            private void listView1_ItemChecked(object sender, ItemCheckedEventArgs e)
            {
                this.updatechecked();
            }
            // While true, updatechecked() returns immediately without touching the totals.
            private bool ignorechecking = false;

            /// <summary>
            /// Counts the checked items in the file list, sums the leading number of each
            /// checked item's second column (the "N KB" text), and shows both totals in
            /// text1 and text2. Does nothing while ignorechecking is set.
            /// </summary>
            private void updatechecked()
            {
                if (ignorechecking)
                {
                    return;
                }

                int checkedCount = 0;
                long totalKilobytes = 0;
                char[] separator = new char[] { ' ' };

                foreach (ListViewItem entry in this.listView1.Items)
                {
                    if (!entry.Checked)
                    {
                        continue;
                    }

                    checkedCount++;

                    // The size column reads "<number> KB"; only the number is parsed.
                    string sizeText = entry.SubItems[1].Text;
                    string numberPart = sizeText.Split(separator)[0];
                    totalKilobytes += Int32.Parse(numberPart);
                }

                this.text1.Text = checkedCount.ToString();
                this.text2.Text = totalKilobytes.ToString() + " KB";
            }
            /// <summary>
            /// Writes every entry of <paramref name="d"/> to <paramref name="filename"/>,
            /// one per line, each terminated by a single "\n". An existing file is overwritten.
            /// </summary>
            /// <param name="d">The words to write.</param>
            /// <param name="filename">Path of the file to create or overwrite.</param>
            private void putHashset(HashSet<string> d, string filename)
            {
                // Stream the entries straight to disk instead of first building the whole
                // file as one string; the bytes written are the same, without holding
                // two extra copies of the word list in memory.
                using (StreamWriter writer = new StreamWriter(filename))
                {
                    foreach (string key in d)
                    {
                        writer.Write(key);
                        writer.Write('\n');
                    }
                }
            }
            /// <summary>
            /// Reads a word list file (one word per line) into a set.
            /// </summary>
            /// <param name="filename">Path of the word list file to read.</param>
            /// <returns>
            /// The distinct non-empty lines of the file, with surrounding whitespace removed.
            /// </returns>
            private HashSet<string> getHashset(string filename)
            {
                HashSet<string> words = new HashSet<string>();

                using (StreamReader reader = new StreamReader(filename))
                {
                    string line;
                    // ReadLine accepts "\n", "\r" and "\r\n" endings, so a list saved by a
                    // Windows editor does not leave "\r" glued to the words.
                    while ((line = reader.ReadLine()) != null)
                    {
                        string word = line.Trim();

                        // Skip blank lines: the file ends with a newline, and splitting on it
                        // used to add an empty string to the set as if it were a word.
                        if (word.Length > 0)
                        {
                            words.Add(word);
                        }
                    }
                }

                return words;
            }
    
            /// <summary>
            /// Unchecks and removes every list item whose "group header + \ + name"
            /// equals <paramref name="fullfilename"/>.
            /// </summary>
            /// <param name="fullfilename">Full path built the same way: directory, backslash, file name.</param>
            private void removefilefromlistview(string fullfilename)
            {
                // Walk backwards by index so removing an item never disturbs the part of
                // the collection that is still to be visited (the original removed items
                // from the collection it was enumerating).
                for (int i = this.listView1.Items.Count - 1; i >= 0; i--)
                {
                    ListViewItem item = this.listView1.Items[i];
                    if (item.Group == null)
                    {
                        // An ungrouped item has no directory, so it cannot match a full path.
                        continue;
                    }

                    string file = item.Group.Header + "\\" + item.SubItems[0].Text;

                    // Paths are identifiers, not linguistic text: compare them ordinally
                    // rather than with the culture-sensitive CompareTo.
                    if (string.Equals(fullfilename, file, StringComparison.Ordinal))
                    {
                        item.Checked = false;
                        this.listView1.Items.RemoveAt(i);
                    }
                }
            }
            // Markup removed before splitting into words: <tags>, %name; references and
            // &name; / &#n; entities. "<[^>]*>" matches the same text as the former
            // "<(.|\n)*?>" (from '<' up to the first '>') without the per-character
            // alternation and capture.
            private static readonly Regex markuppattern = new Regex(@"<[^>]*>|%[a-zA-Z0-9]+?;|&[#a-zA-Z0-9]+?;");

            // Any run of characters outside this letter set separates two words.
            private static readonly Regex wordseparator = new Regex("[^A-Za-zšžõäöüŠŽÕÄÖÜ]+");

            /// <summary>
            /// Lower-cases <paramref name="rawtext"/>, decodes the letter entities of the
            /// word alphabet, strips the remaining markup and adds every word longer than
            /// one character to <paramref name="words"/>.
            /// </summary>
            private static void collectwords(string rawtext, HashSet<string> words)
            {
                // Invariant lower-casing keeps the result independent of the machine's
                // culture (e.g. a Turkish locale would turn 'I' into a dotless 'ı').
                StringBuilder data = new StringBuilder(rawtext.ToLowerInvariant());

                // Decode letter entities before the generic entity stripping below,
                // otherwise the letters themselves would be deleted from the words.
                data.Replace("&auml;", "ä");
                data.Replace("&ouml;", "ö");
                data.Replace("&uuml;", "ü");
                data.Replace("&otilde;", "õ");
                data.Replace("&scaron;", "š");
                data.Replace("&zcaron;", "ž");

                string sdata = markuppattern.Replace(data.ToString(), "");

                foreach (string word in wordseparator.Split(sdata))
                {
                    if (word.Length > 1)
                    {
                        words.Add(word);
                    }
                }
            }

            /// <summary>
            /// Start handler: reads every checked file of the list, merges its words into
            /// the word list stored in the output file (loaded first if it exists), writes
            /// the merged list back and removes the processed items from the list view.
            /// Start and finish times are shown in time1 and time2.
            /// </summary>
            /// <param name="sender">Control that raised the event (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void starter(object sender, EventArgs e)
            {
                setguiLock(true);
                try
                {
                    this.time2.Text = "";
                    this.time1.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);

                    // A set, so a file listed twice is only processed once.
                    HashSet<string> filenames = new HashSet<string>();
                    foreach (ListViewItem item in this.listView1.Items)
                    {
                        if (item.Checked)
                        {
                            String file = item.Group.Header + "\\" + item.SubItems[0].Text;
                            if (File.Exists(file))
                            {
                                filenames.Add(file);
                            }
                        }
                    }

                    string outputfile = output.Text;
                    HashSet<string> words = File.Exists(outputfile)
                        ? getHashset(outputfile)
                        : new HashSet<string>();

                    int filenamesnr = filenames.Count;
                    int filenamesi = 0;
                    foreach (String str in filenames)
                    {
                        progresslabel.Text = string.Format("Progress: \t[ {0} / {1} ]", filenamesi++, filenamesnr);
                        // Let the progress label repaint while this handler is still running.
                        Application.DoEvents();

                        // NOTE(review): UTF-7 is kept from the original because the real
                        // encoding of the input files is not known here. It decodes '+'
                        // sequences as base64 and is obsolete in newer .NET versions -
                        // confirm the corpus encoding and replace it.
                        collectwords(File.ReadAllText(str, Encoding.UTF7), words);

                        removefilefromlistview(str);
                    }
                    progresslabel.Text = "Progress:";
                    putHashset(words, outputfile);

                    // Drop whatever is still checked (e.g. files that no longer exist).
                    // Iterate backwards by index so removal is safe.
                    for (int i = this.listView1.Items.Count - 1; i >= 0; i--)
                    {
                        ListViewItem leftover = this.listView1.Items[i];
                        if (leftover.Checked)
                        {
                            leftover.Checked = false;
                            this.listView1.Items.RemoveAt(i);
                        }
                    }

                    this.time2.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);
                }
                finally
                {
                    // Unlock even when reading or writing a file failed; otherwise the
                    // controls would stay disabled.
                    setguiLock(false);
                }
            }
    
            /// <summary>
            /// Locks or unlocks the input controls. The form itself is disabled while the
            /// individual controls are switched and re-enabled afterwards.
            /// </summary>
            /// <param name="value">
            /// true disables button1, listView1, output and openoutput and shows the
            /// progress label; false enables them again and hides the label.
            /// </param>
            private void setguiLock(bool value)
            {
                bool controlsEnabled = !value;

                this.Enabled = false;

                if (value)
                {
                    this.button1.Enabled = controlsEnabled;
                    this.listView1.Enabled = controlsEnabled;
                    this.output.Enabled = controlsEnabled;
                    this.openoutput.Enabled = controlsEnabled;
                }
                else
                {
                    // Unlocking switches the controls in the reverse order of locking.
                    this.openoutput.Enabled = controlsEnabled;
                    this.output.Enabled = controlsEnabled;
                    this.listView1.Enabled = controlsEnabled;
                    this.button1.Enabled = controlsEnabled;
                }

                this.progresslabel.Visible = value;
                this.Enabled = true;
            }
    
            /// <summary>
            /// Click handler: opens the output file via Process.Start, creating the file
            /// first when it does not exist yet. Does nothing when no output path is entered.
            /// </summary>
            /// <param name="sender">Control that raised the event (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void button2_Click(object sender, EventArgs e)
            {
                string path = output.Text;
                if (path == null || path.Trim().Length == 0)
                {
                    // An empty path would make File.WriteAllText / Process.Start throw.
                    return;
                }

                if (!File.Exists(path))
                {
                    File.WriteAllText(path, " ");
                }

                // Process.Start may return null when no new process was started.
                // Disposing only releases this handle; it does not close the viewer.
                System.Diagnostics.Process viewer = System.Diagnostics.Process.Start(path);
                if (viewer != null)
                {
                    viewer.Dispose();
                }
            }
        }
    }
    
    1 回复  |  直到 13 年前
        1
  •  1
  •   Lou    14 年前

    你需要为这项工作找到合适的工具。对于这样的语言语料库,其数据量和标记量意味着你需要一个合适的、支持 XML 的索引方案。例如 eXist、Xaira、CQP……