代码之家  ›  专栏  ›  技术社区  ›  Kyle Banerjee

使用awk将每行一个字段的标记数据转换为表

awk
  •  -1
  • Kyle Banerjee  · 技术社区  · 7 年前

    我有一个数据文件,其中每个字段各占一行,如下所示。每条记录中出现的字段各不相同,因此我无法使用那些不了解具体字段、只是简单拼接字段的解决方案。

    输入样本

    Creator=Burroughs Wellcome and Company
    Date=ca. 1906
    Description=Blue cardboard box, measuring 5.5 cm x 4.3 cm x 2.2 cm. Box in fair condition.
    Identifier=77-97.1.3a
    DOI=doi:10.6083/M4H41PRC
    Medium=Cardboard
    Relation=References 77-97.1.3b.jpg
    Rights=COPYRIGHT NOT EVALUATED 
    Source=Medical Museum Collection, Box 1
    Subject=Vaporole;;;Epinine;;;Deoxyepinephrine;;;Pharmaceutical Preparations
    Title=Box containing medicine vials
    Type=Still Image
    collection=2
    filename=df0968b22c1072c8909538c516dc81b6.jpg
    id=10959
    
    Date=ca. 1906
    Description=Two stemmed amber glass vials in a blue cardboard box. 
    Identifier=77-97.1.3b
    DOI=doi:10.6083/M4CC0Z0M
    Medium=Glass;;;Cardboard
    Relation=IsPartOf 77-97.1.3a.jpg
    Rights=COPYRIGHT NOT EVALUATED
    Source=Medical Museum Collection, Box 1
    Subject=Vials;;;Vaporole;;;Epinine;;;Deoxyepinephrine;;;Pharmaceutical Preparations
    Title=Medicine vials in a box
    Type=Still Image
    collection=2
    filename=9e846a60d8a79de37e91279696e520e6.jpg
    id=10960
    

    我需要将其转换为带分隔符的文件。由于每个字段可能存在,也可能不存在,因此我需要为每条记录枚举出所有列,例如标题、创建者、日期、标识符等。

    1 回复  |  直到 7 年前
        1
  •  0
  •   Ed Morton    7 年前

    您没有提供示例输出,因此这只是猜测,但这可能是您想要的:

    $ cat tst.awk
    # tst.awk -- turn blank-line-separated records of "name=value" lines into
    # a quoted, comma-separated table with one column per distinct field name.
    #
    # Run with the input file named twice (awk -f tst.awk file file): the first
    # pass collects the field names, the second pass prints the header and rows.
    BEGIN {
        RS   = ""            # paragraph mode: blank lines separate records
        FS   = "\n"          # every line of a record is one field
        OFS  = ","
        ofmt = "\"%s\"%s"    # quoted cell followed by a separator or terminator
    }

    # Return str with each embedded double quote doubled, so that the cell stays
    # well-formed once ofmt wraps it in double quotes.
    function csvEscape(str) {
        gsub(/"/, "\"\"", str)
        return str
    }

    # Pass 1: remember each distinct field name in order of first appearance.
    NR == FNR {
        for (i=1; i<=NF; i++) {
            name = $i
            sub(/=.*/,"",name)
            if ( !seen[name]++ ) {
                nr2name[++numNames] = name
            }
        }
        next
    }

    # Pass 2, first record: print the header row.
    FNR == 1 {
        for (nameNr=1; nameNr<=numNames; nameNr++) {
            name = nr2name[nameNr]
            printf ofmt, csvEscape(name), (nameNr<numNames ? OFS : ORS)
        }
    }

    # Pass 2, every record: print one row, leaving absent fields empty.
    {
        delete name2val
        for (fldNr=1; fldNr<=NF; fldNr++) {
            name = val = $fldNr
            sub(/=.*/,"",name)
            # Anchored so that only the leading "name=" is stripped; any later
            # "=" characters belong to the value and are kept.
            sub(/^[^=]*=/,"",val)
            name2val[name] = val
        }

        for (nameNr=1; nameNr<=numNames; nameNr++) {
            name = nr2name[nameNr]
            val  = csvEscape(name2val[name])
            printf ofmt, val, (nameNr<numNames ? OFS : ORS)
        }
    }
    

    $ awk -f tst.awk file file
    "Creator","Date","Description","Identifier","DOI","Medium","Relation","Rights","Source","Subject","Title","Type","collection","filename","id"
    "Burroughs Wellcome and Company","ca. 1906","Blue cardboard box, measuring 5.5 cm x 4.3 cm x 2.2 cm. Box in fair condition.","77-97.1.3a","doi:10.6083/M4H41PRC","Cardboard","References 77-97.1.3b.jpg","COPYRIGHT NOT EVALUATED ","Medical Museum Collection, Box 1","Vaporole;;;Epinine;;;Deoxyepinephrine;;;Pharmaceutical Preparations","Box containing medicine vials","Still Image","2","df0968b22c1072c8909538c516dc81b6.jpg","10959"
    "","ca. 1906","Two stemmed amber glass vials in a blue cardboard box. ","77-97.1.3b","doi:10.6083/M4CC0Z0M","Glass;;;Cardboard","IsPartOf 77-97.1.3a.jpg","COPYRIGHT NOT EVALUATED","Medical Museum Collection, Box 1","Vials;;;Vaporole;;;Epinine;;;Deoxyepinephrine;;;Pharmaceutical Preparations","Medicine vials in a box","Still Image","2","9e846a60d8a79de37e91279696e520e6.jpg","10960"