代码之家  ›  专栏  ›  技术社区  ›  Charles Okwuagwu

如何把字符串中的第 n 个字符替换为另一个字符 [elixir/erlang]

  •  1
  • Charles Okwuagwu  · 技术社区  · 6 年前

    如何在 Elixir 或 Erlang 中对字符串执行 replace_at 操作?

    例如,给定此固定宽度文件:

    EmployeeFundMappingID EmployeeID  FundID      IsActive EntryDate               ExitDate                ExitTypeID  DateCreated             CreatedByID DateModified            ModifiedByID ConfirmedBy DateConfirmed           GUID                                     IsPooled DatePooled
    1                     1118544     1           1        2009-04-20 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2009-04-20 17:48:12.067 NULL                                     NULL     NULL
    2                     1027350     1           1        2008-03-03 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2008-05-04 15:13:30.303 NULL                                     NULL     NULL
    3                     1024795     1           1        2008-02-29 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2008-05-04 15:13:30.303 NULL                                     NULL     NULL
    4                     1116497     1           1        2009-03-24 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2009-03-24 13:00:15.277 NULL                                     NULL     NULL
    5                     1116569     1           1        2009-03-24 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2009-03-24 14:43:08.280 NULL                                     NULL     NULL
    6                     1116920     1           1        2009-03-27 00:00:00.000 NULL                    NULL        2014-05-17 08:46:48.020 1           2014-10-30 13:34:47.177 NULL         1           2009-03-27 17:16:35.073 NULL                                     NULL     NULL
    

    列位置在:

    [0, 22, 34, 46, 55, 79, 103, 115, 139, 151, 175, 188, 200, 224, 265, 274]

    如何在每个列位置上把 \s 替换为 \t?

    我正在尝试把固定宽度(Fixed-Width)文件转换成 csv。

    3 回复  |  直到 6 年前
        1
  •  3
  •   Charles Okwuagwu    6 年前

    我会用一组函数对原始行做归约(reduce),其中每个函数负责替换字符串中各自位置上的字符。

    # Start offsets of the 2nd..16th columns of each fixed-width line; the
    # single-space separator to replace sits one byte before each offset.
    column_starts = [22, 34, 46, 55, 79, 103, 115, 139, 151, 175, 188, 200, 224, 265, 274]

    # One replacer per separator. A replacer only matches when the byte at its
    # separator offset is a space; any other line raises FunctionClauseError.
    funs =
      for column_start <- column_starts do
        separator_at = column_start - 1

        fn <<head::binary-size(separator_at), " ", tail::binary>> ->
          head <> "\t" <> tail
        end
      end

    lines = input |> String.trim() |> String.split("\n")

    # Thread every line through all replacers, in list order.
    Enum.map(lines, fn line ->
      Enum.reduce(funs, line, fn replace, acc -> replace.(acc) end)
    end)
    

    使用生成的宏(每个位置一次)和递归调用可以更优雅地完成这项工作,但在这里对函数列表做归约(reduce)对我来说更简单。


    这种方法的优点是,它在任何不一致的数据上立即失败,保证(或多或少)如果它通过,转换就完成了,不像所有其他较短的解决方案。

    而且它的速度比任何 Regex 解决方案都快得多。


    因为这将应用于16M行,所以这里可能是性能最好的版本,它可以同时匹配整行:

    # Converts every line of `input` from fixed-width to tab-separated by
    # matching the whole line in a single binary pattern. A line that does not
    # have a space at every separator offset raises FunctionClauseError.
    input
    |> String.trim()
    |> String.split("\n")
    |> Enum.map(
         # Column start offsets, 0-based (the first column starts at 0):
         # [22, 34, 46, 55, 79, 103,
         #  115, 139, 151, 175, 188,
         #  200, 224, 265, 274]
         # Each field is followed by one separator space, so a field's width is
         # next_start - this_start - 1 (e.g. 22 - 0 - 1 = 21 for c1).
         fn <<
            c1 :: binary-size(21),
            " ",
            c2 :: binary-size(11),
            " ",
            c3 :: binary-size(11),
            " ",
            c4 :: binary-size(8),
            " ",
            c5 :: binary-size(23),
            " ",
            c6 :: binary-size(23),
            " ",
            c7 :: binary-size(11),
            " ",
            c8 :: binary-size(23),
            " ",
            c9 :: binary-size(11),
            " ",
            c10 :: binary-size(23),
            " ",
            c11 :: binary-size(12),
            " ",
            c12 :: binary-size(11),
            " ",
            c13 :: binary-size(23),
            " ",
            c14 :: binary-size(40),
            " ",
            c15 :: binary-size(8),
            " ",
            c16 :: binary
            >> ->
         c1 <> "\t" <>
           c2 <> "\t" <>
           c3 <> "\t" <>
           c4 <> "\t" <>
           c5 <> "\t" <>
           c6 <> "\t" <>
           c7 <> "\t" <>
           c8 <> "\t" <>
           c9 <> "\t" <>
           c10 <> "\t" <>
           c11 <> "\t" <>
           c12 <> "\t" <>
           c13 <> "\t" <>
           c14 <> "\t" <>
           c15 <> "\t" <>
           c16
       end)
    
        2
  •  1
  •   Abhyudit Jain    6 年前

    您可以先把日期和时间连接起来,然后用逗号替换所有空格,最后再将日期时间恢复为原始格式:

    # Replace the single whitespace between "-<digits>" and "<digits>" with "T"
    # so the gap inside a datetime survives the next step.
    joined = String.replace(data, ~r/(-\d+)([\s]{1})(\d+)/, "\\1T\\3")

    # Collapse every run of spaces into one comma.
    delimited = String.replace(joined, ~r/ +/, ",")

    # Turn every digit-"T"-digit back into digit-space-digit.
    String.replace(delimited, ~r/(\d)(T)(\d)/, "\\1 \\3")
    
        3
  •  0
  •   Charles Okwuagwu    6 年前

    在超过16M行的数据集上比较两种实现:

      @doc """
      Converts the fixed-width file `src` to tab-separated text appended to `dst`.

      Every line has the single space at byte offsets 11, 51 and 75 replaced by
      a tab, one offset at a time. A line without a space at one of those
      offsets raises `FunctionClauseError`. All converted lines are collected
      before being handed to `write/2`; "START" and the elapsed time are logged.
      """
      def flat2csv1(src, dst) do
        Logger.info("START")

        started_at = System.system_time(:millisecond)

        # One replacer per separator; the separator sits one byte before each
        # listed offset.
        replacers =
          for column_start <- [12, 52, 76] do
            separator_at = column_start - 1

            fn <<head::binary-size(separator_at), " ", tail::binary>> ->
              head <> "\t" <> tail
            end
          end

        converted =
          src
          |> File.stream!()
          |> Enum.map(fn line ->
            Enum.reduce(replacers, line, fn replace, acc -> replace.(acc) end)
          end)

        write(converted, dst)

        log_elapsed("DONE", started_at)
      end
    
      @doc """
      Converts the fixed-width file `src` to tab-separated text appended to `dst`.

      Every line is matched in one binary pattern: fields of 11, 39 and 23
      bytes, each followed by a single space, then the rest of the line. A line
      that does not fit raises `FunctionClauseError`. All converted lines are
      collected before being handed to `write/2`; "START" and the elapsed time
      are logged.
      """
      def flat2csv0(src, dst) do
        Logger.info("START")

        started_at = System.system_time(:millisecond)

        # Whole-line matcher: swaps the three separator spaces for tabs.
        retab = fn <<
                     col1::binary-size(11),
                     " ",
                     col2::binary-size(39),
                     " ",
                     col3::binary-size(23),
                     " ",
                     tail::binary
                   >> ->
          col1 <> "\t" <> col2 <> "\t" <> col3 <> "\t" <> tail
        end

        src
        |> File.stream!()
        |> Enum.map(retab)
        |> write(dst)

        log_elapsed("DONE", started_at)
      end
    
      # Logs `s` followed by the milliseconds elapsed since `t`, where `t` is an
      # earlier `System.system_time(:millisecond)` reading.
      # Logged at :info, the same level as the "START" messages, so the timing
      # is still emitted when the logger level is :info.
      defp log_elapsed(s, t) do
        elapsed = System.system_time(:millisecond) - t
        Logger.info("#{s}: #{elapsed} ms")
      end
    
      # Appends `s` to the file at `dst`; raises if the write fails.
      defp write(s, dst), do: File.write!(dst, s, [:append])
    

    结果

    # flat2csv0
    11:40:25.055 [info] START
    11:42:26.028 [info] DONE: 120969 ms
    
    # flat2csv1
    11:45:17.521 [info] START
    11:48:25.433 [info] DONE: 187906 ms